In [1]:
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv("/kaggle/input/sentiment-analysis-imdb/train.csv")
df.head()

Unnamed: 0,review,sentiment
0,SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<b...,negative
1,"If you want mindless action, hot chicks and a ...",positive
2,"""The Woman in Black"" is easily one of the cree...",positive
3,I can barely find the words to describe how mu...,negative
4,What's in here ?! Let me tell you. It's the pr...,negative


In [3]:
def train_model(model, param_grid, vectorizer, ngram_range, max_features):
    cnt_vectorizer = vectorizer(ngram_range=ngram_range, max_features=max_features)
    features = cnt_vectorizer.fit_transform(df['review'])
    features_nd = features.toarray()

    X_train, X_test, y_train, y_test = train_test_split(features_nd, df['sentiment'], train_size=0.75, random_state=1234)
    
    grid_search = GridSearchCV(model, param_grid, cv=3)
    grid_search.fit(X_train, y_train)

    y_pred = grid_search.predict(X_test)
    print("Model Name:", model.__class__.__name__, "\n", "Ngram Range:", ngram_range, "\n", "Max Features:", max_features, "\n", "Vectorizer:", vectorizer.__name__, "\n", "Accuracy:", accuracy_score(y_test, y_pred))
    print("Best parameters:", grid_search.best_params_)


In [4]:
GB=GradientBoostingClassifier()



In [None]:

param_GB = {'n_estimators': [50, 100, 150, 200], 'max_depth': [3, 5, 7], 'min_samples_split': [3, 5, 7], 'min_samples_leaf': [16, 32]}


models_params = [
    (GB, param_GB),   
]

ngram_range = [(1, 2), (1, 3)]
max_features = [500, 1000, 2000, 3000, 4000, 5000]
vectorizers = [CountVectorizer, TfidfVectorizer]


for model, model_params in models_params:
    for vec in vectorizers:
        for ngram in ngram_range:
            for max_feat in max_features:
                train_model(model, model_params, vec, ngram, max_feat)

Model Name: GradientBoostingClassifier 
 Ngram Range: (1, 2) 
 Max Features: 500 
 Vectorizer: CountVectorizer 
 Accuracy: 0.8162666666666667
Best parameters: {'max_depth': 5, 'min_samples_leaf': 32, 'min_samples_split': 3, 'n_estimators': 200}
