In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## important imports

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,OrdinalEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,KFold,GridSearchCV,cross_val_score
from sklearn.metrics import confusion_matrix, classification_report,f1_score,accuracy_score 
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline


## Data Loading 

In [None]:
movie=pd.read_csv("/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv")
movie.head()

In [None]:
train=pd.read_csv("/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv" )
train.head()

In [None]:
#load test data
test=pd.read_csv("/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv")
test.head()

## Dataset Description

### Predict sentiment of movie reviews.
#### Columns :
          - movieid : named id of the movie
          - sentiment : indicating "POSITIVE" or "NEGATIVE", which is the target.
          - reviewText: reviews about movies 
          
#####    * It is a Supervised Classification problem.

 ##### * F1-score as performance measure

## merge Train and Movie data

In [None]:
movie=movie.drop_duplicates(subset='movieid')  

In [None]:
# Attach movie metadata to every review.
# how='left' keeps every review row even when its movieid has no entry in
# `movie` (the default inner join would silently drop such rows);
# validate='many_to_one' raises if `movie` still contains duplicate movieids,
# which would otherwise duplicate review rows.
merge_train = pd.merge(train, movie, on='movieid', how='left', validate='many_to_one')
merge_test = pd.merge(test, movie, on='movieid', how='left', validate='many_to_one')

In [None]:
merge_train.shape

In [None]:
merge_test.shape

In [None]:
merge_train.info()

### Data Cleaning

In [None]:
merge_train.isnull().sum()

In [None]:
# Share of missing entries in each column, expressed as a percentage of all rows.
print('Percent of missing values per feature: ')
merge_train.isna().sum().mul(100).div(len(merge_train))

In [None]:
merge_train['rating'].value_counts()

In [None]:
merge_test.columns

**Dropping those columns which have a large number of NaN values and are not much correlated with sentiment.**

In [None]:

# Columns removed from both merged frames (same list for train and test).
unused_cols = [
    'movieid',
    'reviewerName',
    'soundType',
    'ratingContents',
    'boxOffice',
    'releaseDateTheaters',
    'releaseDateStreaming',
    'runtimeMinutes',
    'distributor',
]
merge_train = merge_train.drop(columns=unused_cols)
merge_test = merge_test.drop(columns=unused_cols)

In [None]:
merge_train.shape ,merge_test.shape

In [None]:
merge_train.nunique()

### **Imputation**

In [None]:
merge_train['audienceScore'].value_counts()


In [None]:
# audienceScore looks like a useful feature, so fill its missing values.
# For now the column mean is used as the replacement value.
from sklearn.impute import SimpleImputer
imp_mean = SimpleImputer(strategy='mean')
merge_train[['audienceScore']] = imp_mean.fit_transform(merge_train[['audienceScore']])

### Convert text to numeric

In [None]:
# Encode the categorical features as ordinal codes.
# OrdinalEncoder expects a 2-D input (samples x features), so the columns are
# passed together as a DataFrame rather than one 1-D Series at a time. Fitting
# once on all columns also keeps every column's categories available in
# `ordinal_encoder.categories_` instead of overwriting them on each refit.
# 'director' and 'title' are intentionally left unencoded, as before.
categorical_cols = ['isFrequentReviewer', 'sentiment', 'originalLanguage', 'rating', 'genre']
ordinal_encoder = OrdinalEncoder()
merge_train[categorical_cols] = ordinal_encoder.fit_transform(merge_train[categorical_cols])

**check the relationship between features**

In [None]:

# Pairwise correlation of the numeric features only. Free-text columns such as
# reviewText are still present in merge_train, and recent pandas versions raise
# on them instead of silently skipping them.
merge_train.select_dtypes(include='number').corr()

## Data Exploration

In [None]:
movie.shape , train.shape ,test.shape

In [None]:
movie.info() 

In [None]:
 train.info()

* Total entries: 162758 
* There are total 5 columns: 4 features + 1 label
  * Label column: `sentiment`
  * Features: `[movieid, reviewerName, isFrequentReviewer , reviewText]`

In [None]:
train.describe()

In [None]:
train['sentiment'].unique()

In [None]:
train['sentiment'].value_counts()

#sentiment is roughly balanced so we can use it as is

In [None]:
train['isFrequentReviewer'].value_counts()

In [None]:
train.isna().sum()

#only reviewText has some NaN values

In [None]:
train.nunique()

In [None]:
##statistics of data
merge_train.describe()



There are some outliers in some of the categorical/text features and in the audience score.

#### **Handling Outliers**

**Very few or no outliers are present in: 'genre','rating','director','title','sentiment','isFrequentReviewer'**

In [None]:
## box plot
# audienceScore have some outliers
sns.boxplot(x=merge_train['audienceScore'])

There are some outliers between **0 and 10**.

Ways to deal with outliers :

  * remove all the outliers
  * keep them, or convert them to some other value
  * make bins then deal with it(used below)

In [None]:
# First and third quartiles of audienceScore and the inter-quartile range
# (IQR = Q3 - Q1). Under the usual 1.5*IQR rule, outliers are the points below
# Q1 - 1.5*IQR or above Q3 + 1.5*IQR. The IQR is only displayed here; the two
# quartiles are reused below as the bin edges for audienceScore.
first_qurtile = merge_train['audienceScore'].quantile(q = 0.25)
third_qurtile = merge_train['audienceScore'].quantile(q = 0.75)
IQR = third_qurtile- first_qurtile
IQR

In [None]:
merge_train['audienceScore'].value_counts()

In [None]:
# Replace audienceScore by an ordinal bin derived from the quartiles above:
#   1 -> score <= Q1,   2 -> Q1 < score <= Q3,   3 -> score > Q3
# All three bins are decided from one snapshot of the raw scores. Writing the
# bins back one mask at a time would compare already-binned values (1, 2)
# against the later thresholds, which is only correct while Q3 happens to be
# larger than the bin labels.
raw_score = merge_train['audienceScore'].copy()
binned_score = np.select(
    [raw_score <= first_qurtile, raw_score <= third_qurtile],
    [1.0, 2.0],
    default=3.0,
)
# Rows whose score is missing stay missing instead of falling into bin 3.
merge_train['audienceScore'] = pd.Series(binned_score, index=raw_score.index).where(raw_score.notna())


In [None]:
merge_train['audienceScore'].value_counts()

In [None]:
# contingency table
pd.crosstab(merge_train['audienceScore'], merge_train['sentiment'])

In [None]:
sns.boxplot(x=merge_train['audienceScore'])

### Scaling Numerical features 

In [None]:
# Rescale the selected (already ordinal-encoded) columns to the [0, 1] range.
scaler = MinMaxScaler()

# define the columns to be scaled
cols_to_scale = ['rating', 'originalLanguage']
# `cols_to_scale` is already a list, so index with it directly; wrapping it in
# another pair of brackets passes a list containing a list and raises.
merge_train[cols_to_scale] = scaler.fit_transform(merge_train[cols_to_scale])


### Data Visualization

In [None]:
# Pie chart of the class balance. The labels come from the counts' own index so
# that each wedge is named after the class it actually represents, and autopct
# prints the percentage of each class on the chart.
sentiment_counts = train['sentiment'].value_counts()
plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%')
plt.title("sentiment anaylysis")

##### *66.8% of the sentiments are POSITIVE and 33.2% are NEGATIVE*

In [None]:
# Bar chart of the class counts. Labels and heights are both taken from the
# same value_counts() result: unique() returns classes in order of first
# appearance while value_counts() sorts by frequency, so pairing the two can
# attach a count to the wrong class.
sentiment_counts = train['sentiment'].value_counts()
plt.bar(sentiment_counts.index, sentiment_counts.values)
plt.title("sentiment anaylysis")


### final clean

**Here we use only train dataset**

- **movieid and reviewerName do not seem important for sentiment analysis**

- **isFrequentReviewer is only weakly correlated with sentiment, as we saw above;
  therefore we dropped these columns**

In [None]:
traindf=train.drop(['movieid','reviewerName','isFrequentReviewer'],axis=1)

In [None]:
traindf.isna().sum()

In [None]:
# Replace every missing value in traindf with a single space (' ', not an empty
# string). fillna is applied to the whole frame, not just reviewText.
traindf=traindf.fillna(' ')

In [None]:
traindf.isna().sum()

In [None]:
x=traindf['reviewText']
y=traindf['sentiment']

In [None]:
x.shape,y.shape

In [None]:
#splitting the data into train and test set
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

###  Handling text

**Finding the best n-gram range for the count vectorizer with the best model**

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer

# pipeline = Pipeline([
#     ('count', CountVectorizer()),
#     ('log', LogisticRegression())  # Replace with your desired classifier
# ])

# param = {
#     'count__ngram_range': [(1, 1), (1, 2), (1, 3)]  # Experiment with different ngram_range values
# }

# grid = GridSearchCV(pipeline, param_grid=param, cv=3, scoring='f1_micro')
# grid.fit(x_train, y_train)

In [None]:
# print("Best Parameter:", grid.best_params_)
# print("Best F1 Score:", grid.best_score_)
# # Best Parameter: {'vectorizer__ngram_range': (1, 3)}
# # Best F1 Score: 0.810239159485738

In [None]:
# # Evaluate the best model on the test set
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(x_test)
# f1_micro = f1_score(y_test, y_pred, average='micro')
# print("Test Score:", f1_micro)
# # Test Score: 0.8147272057016466


In [None]:
# Vectorize the review text (convert text into numeric count features).
# The vectorizer uses unigrams, bigrams and trigrams (ngram_range=(1, 3)).
# It is fitted on the training split only; the held-out split is transformed
# with the fitted vectorizer, so no information from it is used during fitting.
vector=CountVectorizer(ngram_range=(1, 3))
vec_x_train=vector.fit_transform(x_train)
vec_x_test=vector.transform(x_test)

In [None]:
type(vec_x_train)

In [None]:
# vec_x_train.shape, vec_x_test.shape

In [None]:
y_train.shape

###  baseline model (dummy classifier)

*0.6674423605671014*

In [None]:
# Baseline: a DummyClassifier fitted on the training split, scored with
# micro-averaged F1 on that same training split.
dummy = DummyClassifier(random_state=42).fit(vec_x_train, y_train)
baseline_train_pred = dummy.predict(vec_x_train)
f1_score(y_train, baseline_train_pred, average='micro')

## Select and train ML model

### Model 1
0.9845706035052149

**we have the best accuracy for n=1.**

In [None]:
# Logistic regression on the n-gram count features.
# The 'saga' solver shuffles the data while optimising, so the seed is pinned
# (42, as in the train/test split) to make the reported scores repeatable.
# LogisticRegression is already imported in the notebook's import cell.
model1 = LogisticRegression(C=7, max_iter=1000, solver='saga', random_state=42)
model1.fit(vec_x_train, y_train)


In [None]:

#  prediction on test set
y_pred=model1.predict(vec_x_test)

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
#just commenting
f1_score(y_train,model1.predict(vec_x_train),average='micro')
#0.9847395665330323
#n=3 0.9848163679093129
#c=6 0.984824048046941

In [None]:

# # check accuracy_score
acc=accuracy_score(y_test,y_pred)
print(acc)
##default:0.7939297124600639
##best params :0.8126382403538953
#n=3 0.8144814450724994
#c=6  0.8137748832637012

***Hyperparameter tuning***

In [None]:

# param_grid = [ {   
# #      {'penalty' : ['l1','l2'],
#      'C' : [4,5,6,7,8],
# #     'solver' : ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'],
# #      'max_iter' : [1000]
#      }
#  ]

In [None]:
# clf = GridSearchCV(model1, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)
# best_clf = clf.fit(vec_x_train,y_train)

In [None]:
# best_clf.best_estimator_
#penalty=l1,solver=saga,C=7

In [None]:
# print (f'Accuracy - : {best_clf.score(vec_x_train,y_train):.3f}')
##0.847
#best params :0.985

### **Model 2 (SGDClassifier)**
0.9806614134525291

In [None]:
# Logistic-regression-style linear model trained with stochastic gradient descent.
# - loss='log_loss' is the current name of the logistic loss; the older alias
#   'log' was deprecated in scikit-learn 1.1 and later removed.
#   NOTE(review): on scikit-learn < 1.1 the old spelling 'log' is required.
# - random_state pins the data shuffling so the scores are repeatable.
# SGDClassifier is already imported in the notebook's import cell.
sgd = SGDClassifier(alpha=0.000001, loss='log_loss', penalty='l2', random_state=42)
sgd.fit(vec_x_train, y_train)

In [None]:
y_sgd_pred=sgd.predict(vec_x_test)

In [None]:
f1_score(y_train,sgd.predict(vec_x_train),average='micro')
#0.9806614134525291

In [None]:
acc=accuracy_score(y_test,y_sgd_pred)
print(acc)
# 0.8119009584664537

In [None]:
# Confusion matrix for the SGD model: use its own held-out predictions
# (`y_sgd_pred`), not `y_pred`, which holds the logistic-regression predictions.
confusion_matrix(y_test, y_sgd_pred)

#### **Hyperparameter Tuning**

In [None]:
# grid = {
#     "loss" : ["hinge", "log", "squared_hinge", "modified_huber"],
#     "alpha" : [0.001,0.0001,0.00001,0.000001],
#     "penalty" : ["l2", "l1"],
# }

In [None]:
# clf = GridSearchCV(sgd, param_grid = grid, cv = 3, verbose=True, n_jobs=-1)
# best_clf = clf.fit(vec_x_train,y_train)

In [None]:
# best_clf.best_estimator_

In [None]:
# best_clf.best_params_
#{'alpha': 0.001, 'loss': 'modified_huber', 'max_iter': 1000, 'penalty': 'l2'}
# {'alpha': 0.001, 'loss': 'log', 'max_iter': 1000, 'penalty': 'l2'}

In [None]:
# print (f'Accuracy - : {best_clf.score(vec_x_train,y_train):.3f}')
# Accuracy - : 0.909

### **MODEL 3**
0.9690183248083806


In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Linear SVM with hinge loss, using the parameter set recorded from the grid
# search in the tuning cells below (C=0.1, loss='hinge').
# random_state is pinned because the dual solver (dual=True) shuffles the data;
# with random_state=None the fitted model can differ from run to run.
lsvc = LinearSVC(
    C=0.1,
    class_weight=None,
    dual=True,
    fit_intercept=True,
    intercept_scaling=1,
    loss='hinge',
    max_iter=1000,
    multi_class="ovr",
    penalty='l2',
    random_state=42,
    tol=0.0001,
    verbose=0,
)
lsvc.fit(vec_x_train, y_train)
# Predictions on the held-out split, used by the report cells that follow.
y_pred_svc = lsvc.predict(vec_x_test)


In [None]:
print(classification_report(y_test, y_pred_svc))

In [None]:
confusion_matrix(y_test, y_pred_svc)

In [None]:
f1_score(y_train,lsvc.predict(vec_x_train),average='micro')
# 0.9174846013240557
#best paras:0.8490315346451008

In [None]:
acc=accuracy_score(y_test,y_pred_svc)
print(acc)
# 0.7939297124600639
##best_params:0.7944519537970017

#### hyperparameter tuning

In [None]:
# param_grid={'C': [0.1],'loss': ['hinge']}
# param_grid = {
#     'C' : np.logspace(-3, 3, 10),
#     'loss': ['hinge'],#,'squared_hinge' by defaukt
#     'max_iter': [100]
# }
# grid = GridSearchCV(lsvc,param_grid)

In [None]:
# grid.fit(vec_x_train,y_train)

In [None]:
#Calculating the accuracy of tuned model
# grid_svc = grid.predict(vec_x_train)
# accuracy_score(y_train,grid_svc)
##best params:0.8490315346451008

In [None]:
#Classification report for the tuned model
# print(classification_report(y_train,grid_svc))

In [None]:
# print(grid.best_estimator_.get_params())
##{'C': 0.1, 'class_weight': None, 'dual': True, 'fit_intercept': True, 'intercept_scaling': 1, 'loss': 'hinge', 'max_iter': 1000, 'multi_class': 'ovr', 'penalty': 'l2', 'random_state': None, 'tol': 0.0001, 'verbose': 0}

#### Model 4

In [None]:
from sklearn.naive_bayes import MultinomialNB
nb=MultinomialNB(alpha=0.01)
nb.fit(vec_x_train,y_train)

In [None]:
#  prediction on test set
y__pred=nb.predict(vec_x_test)

In [None]:
f1_score(y_train,nb.predict(vec_x_train),average='micro')
# default:0.9516612137689507
#best_params:0.9801161236809364

In [None]:
acc=accuracy_score(y_test,y__pred)
print(acc)
# #defaukt:0.793407471123126
#0.782225362496928

In [None]:
# param_grid = [    
#     {'penalty' : ['l1'],
#     'C' : [1,10,100,1000],
#     'solver' : ['saga'],
#     'max_iter' : [100]
#     }
# ]
# para=[{'alpha': [0.00001, 0.0001, 0.001,0.01],
#       }]

In [None]:
# clf = GridSearchCV(nb, param_grid = para, cv = 3, verbose=True, n_jobs=-1)
# best_cl = clf.fit(vec_x_train,y_train)

In [None]:
# best_cl.best_params_

In [None]:
# print (f'Accuracy - : {best_cl.score(vec_x_test,y_test):.3f}')

## ---------------------------------------------------------------

#### **MODEL 4 (DecisionTreeClassifier)**
**(0.68852)**

In [None]:
# from sklearn.tree import DecisionTreeClassifier

In [None]:
# # commenting
# # Create Decision Tree classifer object
# clf = DecisionTreeClassifier()

# # Train Decision Tree Classifer
# clf.fit(vec_x_train,y_train)

# # Predict the response for test dataset
# y_clfpred = clf.predict(vec_x_test)

In [None]:
# commenting
# f1_score(y_train,clf.predict(vec_x_train),average='micro')

In [None]:
# confusion_matrix(y_test, y_clfpred)

In [None]:
# print(classification_report(y_test, y_clfpred))

In [None]:
# # #check accuracy_score
# acc1=accuracy_score(y_test,y_clfpred)
# print(acc1)

In [None]:
# from sklearn import tree
# fig = plt.figure(figsize=(25,20))
# _ = tree.plot_tree(clf,
#                    feature_names=train['sentiment'],
#                    class_names=['NEGATIVE', "POSITIVE"],
#                    filled=True)

**Hyperparameter**

In [None]:
# from sklearn.model_selection import GridSearchCV
# params = {'max_depth':[3,5,7,10,15],
#           'min_samples_leaf':[3,5,10,15,20],
# #           'min_samples_split':[8,10,12,18,20,16],
#           'criterion':['gini','entropy']}
# GS = GridSearchCV(estimator=clf,param_grid=params,n_jobs=-1, verbose=True)

In [None]:
# GS.fit(vec_x_train,y_train)

In [None]:
#just commenting
# # evaluate a logistic regression model using k-fold cross-validation
# from sklearn.model_selection import KFold
# from sklearn.model_selection import cross_val_score
# kf = KFold(n_splits = 6, random_state=1, shuffle=True)  
# # evaluate model
# score = cross_val_score(model1, vec_x_train,y_train, cv=kf)  
  
# # Printing accuracy scores/report performance 
# print("K-fold Cross Validation Scores are: ", score)  
# print("Mean Cross Validation score is: ", score.mean())

In [None]:
#tuning hyperparameter
# from sklearn.model_selection import GridSearchCV

In [None]:
# #commenting
# # Necessary imports
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
 
# # Creating the hyperparameter grid
# c_space = np.logspace(-5, 8, 15)
# param_grid = {'C': c_space}
 
# # Instantiating logistic regression classifier
# logreg = LogisticRegression()
 
# # Instantiating the GridSearchCV object
# logreg_cv = GridSearchCV(logreg, param_grid, cv = 5)
 
# logreg_cv.fit(vec_x_train,y_train)
 
# # Print the tuned parameters and score
# print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
# print("Best score is {}".format(logreg_cv.best_score_))

In [None]:
# #just commenting
# # Necessary imports
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV
 
# # Creating the hyperparameter grid
# # c_space = 0.44
# param_grid = {'C': [0.440,0.430,0.439]}
 
# # Instantiating logistic regression classifier
# logreg = LogisticRegression()
 
# # Instantiating the GridSearchCV object
# logreg_cv = GridSearchCV(logreg, param_grid, cv = 5)
 
# logreg_cv.fit(vec_x_train,y_train)

In [None]:
# logreg_cv.best_params_

In [None]:

# from sklearn.metrics import  precision_score
# logreg = LogisticRegression(penalty='none')
# logreg.fit(vec_x_train,y_train)

# y_pred = logreg.predict(vec_x_test)

# print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(logreg.score(vec_x_train,y_train)))
# print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(vec_x_test, y_test)))


## KNN
(public score: 0.66855)

In [None]:
# k-nearest-neighbours classifier. The settings match the best parameters
# recorded in the tuning cells below:
# {'metric': 'euclidean', 'n_neighbors': 19, 'weights': 'uniform'}.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(metric= 'euclidean', n_neighbors=19, weights='uniform')


In [None]:
# from sklearn import metrics

In [None]:
# Fit KNN on the vectorized training split.
knn.fit(vec_x_train,y_train)
# Predict on the held-out split only; the training-set prediction is left disabled.
# y_pred= knn.predict(vec_x_train)
y_pred1= knn.predict(vec_x_test)

In [None]:
#just commenting
# y_test.shape , y_pred1.shape

In [None]:
# Accuracy of the KNN predictions on the held-out split
# (this is plain accuracy, not a ROC score).
acc=accuracy_score(y_test,y_pred1)
print("accuracy of test:",acc)
# Training-set accuracy, left disabled:
#acc1=accuracy_score(y_train,y_pred)
#print("accuracy of test:",acc1)
# Recorded results from earlier runs:
##accuracy of test: 0.6437392479724748
##best params:0.6689911526173506

In [None]:
# confusion_matrix(y_test, y_pred1)

In [None]:
# print(classification_report(y_test, y_pred1))

**Hyperparameter tuning**

In [None]:
# from sklearn.model_selection import GridSearchCV

In [None]:
# grid_params = { 'n_neighbors' : list(range(5,20)),
#                'weights' : ['uniform'],
#                'metric' : ['euclidean','manhattan']}

In [None]:
#just commenting
# grid_params = { 'n_neighbors' : [11,12,13,14,15],
#                'weights' : ['uniform'],
#                'metric' : ['euclidean']}

In [None]:
# just commenting
# gs = GridSearchCV(KNeighborsClassifier(), grid_params, verbose = 1, cv=3)

In [None]:
#just commenting
# fit the model on our train set
#Fitting 3 folds for each of 30 candidates, totalling 90 fits
# g_res = gs.fit(vec_x_test,y_test)

In [None]:
#just commenting
# find the best score
# g_res.best_score_
##0.6496065614939749

In [None]:
#just commenting
# get the hyperparameters with the best score
# g_res.best_params_
##{'metric': 'euclidean', 'n_neighbors': 19, 'weights': 'uniform'}

In [None]:
# use the best hyperparameters
# knn = KNeighborsClassifier(n_neighbors = 5, weights = 'uniform',algorithm = 'brute',metric = 'minkowski')
# knn.fit(vec_x_train,y_train)

In [None]:
#just commenting
# use the best hyperparameters
# knn = KNeighborsClassifier(n_neighbors = 15, weights = 'uniform',metric = 'euclidean')
# knn.fit(vec_x_train,y_train)

In [None]:
#just commenting
# get a prediction
# y_hat = knn.predict(vec_x_train)
# y_knn = knn.predict(vec_x_test)

KNN Model evaluation

In [None]:
#just commenting
# print('Training set accuracy: ', metrics.accuracy_score(y_train, y_hat))
# print('Test set accuracy: ',metrics.accuracy_score(y_test, y_knn))
# #scores of case1 before tuning
#Training set accuracy:  0.7571310077876595
# Test set accuracy:  0.6437392479724748

In [None]:
#just commenting
# from sklearn.metrics import classification_report
# print(classification_report(y_test, y_knn))
# confusion_matrix(y_test, y_knn)

In [None]:
#just commenting
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(knn, vec_x_train,y_train, cv =5)

In [None]:
#just commenting
# print('Model accuracy: ',np.mean(scores))

**Bagging Algo**

In [None]:
#commenting
# from sklearn.ensemble import BaggingClassifier

In [None]:
# #commenting
# model_bc = BaggingClassifier()
# model_bc.fit(vec_x_train,y_train)
# bc_y_pred=model_bc.predict(vec_x_test)

In [None]:
#commenting
# f1_score(y_train,model_bc.predict(vec_x_train),average='micro')

In [None]:
# confusion_matrix(y_test, bc_y_pred)

In [None]:
# acc2=accuracy_score(y_test,bc_y_pred)
# print(acc2)

**Boosting algo**

In [None]:
#commenting
# from sklearn.ensemble import GradientBoostingClassifier

In [None]:
#commenting
# gb_clf_model = GradientBoostingClassifier()
# gb_clf_model.fit(vec_x_train , y_train)
# gb_y_pred=gb_clf_model.predict(vec_x_test)

In [None]:
#commenting
# f1_score(y_train,gb_clf_model.predict(vec_x_train),average='micro')

In [None]:
# acc3=accuracy_score(y_test,gb_y_pred)
# print(acc3)

**Multi Layer Perceptron algorithm**

In [None]:
#commenting
# from sklearn.neural_network import MLPClassifier

In [None]:
#commenting
# mlp_model=MLPClassifier()
# mlp_model.fit(vec_x_train,y_train)
# mlp_y_pred=mlp_model.predict(vec_x_test)

In [None]:
#commenting
# f1_score(y_train,mlp_model.predict(vec_x_train),average='micro')

In [None]:
# acc4=accuracy_score(y_test,mlp_y_pred)
# print(acc4)

## Comparing Models

In [None]:
# Display names and matching estimators for the side-by-side comparison; the
# two lists are paired by position, so keep them in the same order.
models = ["LogisticRegression", "SGDClassifier", "LinearSVC", "KNearestNeighbour"]
classifier = [
    # random_state is pinned on every estimator that shuffles the data while
    # fitting, so the comparison table is repeatable.
    LogisticRegression(C=7, max_iter=1000, solver='saga', random_state=42),
    # 'log_loss' is the current name of the logistic loss ('log' was deprecated
    # in scikit-learn 1.1 and later removed).
    # NOTE(review): on scikit-learn < 1.1 the old spelling 'log' is required.
    SGDClassifier(alpha=0.000001, loss='log_loss', penalty='l2', random_state=42),
    LinearSVC(
        C=0.1,
        class_weight=None,
        dual=True,
        fit_intercept=True,
        intercept_scaling=1,
        loss='hinge',
        max_iter=1000,
        multi_class="ovr",
        penalty='l2',
        random_state=42,
        tol=0.0001,
        verbose=0,
    ),
    KNeighborsClassifier(metric='euclidean', n_neighbors=19, weights='uniform'),
    #     MultinomialNB(alpha=0.01)
]

In [None]:
# Fit every candidate on the training split and compare them on the held-out
# split. Scoring on the training data (as before) rewards memorisation and
# hides the gap between the models; the held-out predictions were already being
# computed here but were never used.
scores = []
for name, model in zip(models, classifier):
    model.fit(vec_x_train, y_train)
    # Loop-local name, so the global `y_pred` from Model 1 is not overwritten.
    held_out_pred = model.predict(vec_x_test)
    scores.append(f1_score(y_test, held_out_pred, average='micro'))

**LogisticRegression, LinearSVC and SGDClassifier perform somewhat similarly.**

In [None]:
# Summary table: one row per compared model with its score.
df = pd.DataFrame({'models': models, 'score': scores})
df

In [None]:
plt.bar('models','score',data=df)

### roc_curve

In [None]:
# Class-probability estimates on the held-out split for each fitted model.
# These feed the ROC cells below, which read column 1 of each array.
pred_prob1 = model1.predict_proba(vec_x_test)
pred_prob2 = sgd.predict_proba(vec_x_test)
# NOTE(review): `_predict_proba_lr` is a private scikit-learn helper (leading
# underscore) and may change between releases; `lsvc.decision_function` is the
# public way to get a ranking score for a ROC curve - confirm before relying on it.
pred_prob3 = lsvc._predict_proba_lr(vec_x_test)
pred_prob4 = nb.predict_proba(vec_x_test)
pred_prob5 = knn.predict_proba(vec_x_test)


In [None]:
from sklearn.metrics import roc_curve

# ROC points for each model, scored with column 1 of its probability array and
# with 'POSITIVE' treated as the positive label.
fpr_1, tpr_1, thres1 = roc_curve(y_test, pred_prob1[:, 1], pos_label='POSITIVE')
fpr_2, tpr_2, thres2 = roc_curve(y_test, pred_prob2[:, 1], pos_label='POSITIVE')
fpr_3, tpr_3, thres3 = roc_curve(y_test, pred_prob3[:, 1], pos_label='POSITIVE')
fpr_4, tpr_4, thres4 = roc_curve(y_test, pred_prob4[:, 1], pos_label='POSITIVE')
fpr_5, tpr_5, thres5 = roc_curve(y_test, pred_prob5[:, 1], pos_label='POSITIVE')

# Reference curve (tpr = fpr): every sample gets the same constant score.
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label='POSITIVE')

In [None]:
# Overlay the ROC curve of every fitted model on a single set of axes.
# (matplotlib.pyplot is already imported as `plt` in the import cell.)
roc_series = [
    (fpr_1, tpr_1, 'red', 'Logistic Regression'),
    (fpr_2, tpr_2, 'blue', 'SGDClassifier'),
    (fpr_3, tpr_3, 'green', 'LinearSVC'),
    (fpr_5, tpr_5, 'orange', 'KNeighborsClassifier'),
    (fpr_4, tpr_4, 'pink', 'MultinomialNB'),
]
for fpr, tpr, colour, label in roc_series:
    plt.plot(fpr, tpr, linestyle='--', color=colour, label=label)
# Reference diagonal (tpr = fpr), labelled so that it appears in the legend.
plt.plot(p_fpr, p_tpr, linestyle='--', color='black', label='Chance')

plt.title('ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
# dpi=10 produced an image only a few dozen pixels wide; 100 keeps the saved
# figure legible.
plt.savefig('ROC', dpi=100)
plt.show()

***Here we observe that the KNN model performs the worst, followed by MultinomialNB,***

***and performance of other three models is similar***

# Test 

In [None]:
#load test data
test=pd.read_csv("/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv")
test.head()

### Preprocessing on test data

In [None]:
testdf=test.drop(['movieid','reviewerName','isTopCritic'],axis=1)

In [None]:
#replacing null values
testdf=testdf.fillna(' ')

In [None]:
testdf

### Submission Using the Best Model

In [None]:
# Vectorize the competition test reviews with the vectorizer fitted on the
# training split. A separate name is used on purpose: reusing `vec_x_test`
# would overwrite the held-out validation matrix, so re-running any evaluation
# cell afterwards would score predictions for the wrong rows against `y_test`.
vec_x_submission = vector.transform(testdf['reviewText'])

# Predict the competition test set with the best model (logistic regression).
test_pred = model1.predict(vec_x_submission)

In [None]:
test_pred

In [None]:
# Build the submission file: a running id (0..n-1) and the predicted label for
# each row of the test set, written without the DataFrame index.
submission = pd.DataFrame({
    'id': list(range(len(test_pred))),
    'sentiment': test_pred,
})
submission.to_csv('submission.csv', index=False)

In [None]:
submission.shape

In [None]:
submission