# Assignment module 4

This assignment covers sentiment analysis. The main goal is to train a model in Python that is able to predict the sentiment in a sentence. For this multiple combinations of vectorizers and models will be tested. 

### Package import

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.metrics import ConfusionMatrixDisplay as cmd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
import os
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

### Data import

In [2]:
# The path of the dataset
url = 'https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/master/Exercise_4_Text_classification/Pakistani%20Traffic%20sentiment%20Analysis.csv'

# Load the data use the pandas
df = pd.read_csv(url)

### Exploratory data analysis

In [3]:
df.info()

# Displaying the instances of each class
df.groupby('Sentiment').describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2109 entries, 0 to 2108
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       2109 non-null   object
 1   Sentiment  2109 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 33.1+ KB


Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
Sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1010,1008,Traffic open at shahrah e faisal,2
1,1099,1079,Road is closed for traffic at star gate toward...,3


In [4]:
# Delete the duplicate rows
df = df.drop_duplicates()

# Displaying the instances of each class
df.groupby('Sentiment').describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
Sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1008,1008,Adayala road is clear,1
1,1079,1079,Traffic jam from parbat rd to nazim-ud-din rd ...,1


### Creating the model

In [5]:
model_list = ["LogisticRegression", "KNeighborsClassifier", "RandomForestClassifier", "XGBClassifier", "SVC", "BernoulliNB"]
vectorizer_list = ["CountVectorizer", "HashingVectorizer", "TfidfVectorizer"]
df_results = pd.DataFrame(columns = ["Model", "Vectorize", "Optimal parameters", "Accuracy score training", "Accuracy score test"])

for i in model_list:
    for j in vectorizer_list:
        #Test all the possible vectorizers:
        if j == "CountVectorizer":
            vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english',min_df=20)
        elif j == "HashingVectorizer":
            vectorizer = HashingVectorizer(ngram_range=(1, 2), n_features=200)
        elif j == "TfidfVectorizer":
            vectorizer = TfidfVectorizer(min_df=20,norm='l2',smooth_idf=True,use_idf=True,ngram_range=(1, 1),stop_words='english')
        else:
            print("Error: Vectorizer not assigned correctly")
        
        #Set the vectorizer and split the data in a train and test set:
        x = vectorizer.fit_transform(df['Text'])
        y = df['Sentiment']
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=0)
        
        #Test all the possible model types:
        if i == "LogisticRegression":
            model = LogisticRegression(max_iter=1000, random_state=0)
            param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
        elif i == "KNeighborsClassifier":
            model=KNeighborsClassifier()
            param_grid = {'n_neighbors': [3, 5, 7, 9], 
                          'weights': ['uniform', 'distance']}
        elif i == "RandomForestClassifier":
            model = RandomForestClassifier(random_state=0)
            param_grid = {'n_estimators': [100, 200, 300], 
                          'max_depth': [None, 10, 20, 30], 
                          'min_samples_split': [2, 5, 10], 
                          'min_samples_leaf': [1, 2, 4]}
        elif i == "XGBClassifier":
            model =  XGBClassifier()
            param_grid = {'learning_rate': [0.01, 0.1, 0.2],
                          'n_estimators': [100, 200, 300],
                          'max_depth': [3, 4, 5]}
        elif i == "SVC":
            model= SVC(probability=True)
            param_grid = {'kernel': ['linear', 'rbf', 'poly'],
                          'C': [0.1, 1, 10]}
        elif i == "BernoulliNB":
            model=BernoulliNB()
            param_grid = {'alpha': [0.1, 0.5, 1],
                          'force_alpha': [True,False]}
        else:
            print("Error: Model not assigned correctly")
            
        
        #Perform gridsearch with 5 fold cross-validtion to maximize the accuracy for the model-vectorizer combination:
        grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')

        #Fit the model to the training set for each combination in the gridsearch:
        grid_search.fit(x_train, y_train)

        #Find best parameters and corresponding accuracy score:
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        
        #Run the model-vectorizer combination on the test set for the best parameter combination
        model = grid_search.best_estimator_
        
        
        #Add the results to the results dataframe
        df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : best_params, 
                                        "Accuracy score training" : best_score, 
                                        "Accuracy score test" : accuracy_score(y_test,model.predict(x_test))}, 
                                       ignore_index = True)
        
print(df_results)

  df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : best_params,
  df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : best_params,
  df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : best_params,
  df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : best_params,
  df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : best_params,
  df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : best_params,
  df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : best_params,
  df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : best_params,
  df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : best_params,
  df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : bes

                     Model Vectorize  \
0       LogisticRegression       NaN   
1       LogisticRegression       NaN   
2       LogisticRegression       NaN   
3     KNeighborsClassifier       NaN   
4     KNeighborsClassifier       NaN   
5     KNeighborsClassifier       NaN   
6   RandomForestClassifier       NaN   
7   RandomForestClassifier       NaN   
8   RandomForestClassifier       NaN   
9            XGBClassifier       NaN   
10           XGBClassifier       NaN   
11           XGBClassifier       NaN   
12                     SVC       NaN   
13                     SVC       NaN   
14                     SVC       NaN   
15             BernoulliNB       NaN   
16             BernoulliNB       NaN   
17             BernoulliNB       NaN   

                                   Optimal parameters  \
0                                            {'C': 1}   
1                                           {'C': 10}   
2                                          {'C': 0.1}   
3          

  df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : best_params,
  df_results = df_results.append({"Model" : i, "Vectorizer" : j, "Optimal parameters" : best_params,
