<a href="https://colab.research.google.com/github/Ssactl/AH2179_Applied-AI-in-Transportation/blob/main/Module4_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.metrics import ConfusionMatrixDisplay as cmd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
import os
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score

Load the data

In [3]:
# The path of the dataset
url = 'https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/master/Exercise_4_Text_classification/Pakistani%20Traffic%20sentiment%20Analysis.csv'

# Load the data use the pandas
df = pd.read_csv(url)

# Display the data
df.head()

Unnamed: 0,Text,Sentiment
0,Adayala road is clear,0
1,Traffic jam from parbat rd to nazim-ud-din rd ...,1
2,Mandra is clear,0
3,Fort street is clear,0
4,"Mashriq Hotel towards Fawara Chowk, City Sadda...",1


In [4]:
# Delete the duplicate rows
df = df.drop_duplicates()

# Displaying the instances of each class
df.groupby('Sentiment').describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
Sentiment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1008,1008,Adayala road is clear,1
1,1079,1079,Traffic jam from parbat rd to nazim-ud-din rd ...,1


Vectorize the data

In [5]:
import re
# The preprocess_text function uses the re.sub() method from Python's regular expression library (re) to remove all digits from the text and convert it to lowercase.
def preprocess_text(text):
    return re.sub(r'\d+', '', text).lower()

In [6]:
# CountVectorizer
#the ngram_range represents the length range of phrase segmentation,the stop_words parameter is set to "english" to remove common English stop words like "and", "the", and "a" ,min_df is to serve as a threshold to delete some words.
#and the additional preprocessor parameter set to preprocess_text
c_vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english',min_df=20,preprocessor=preprocess_text)

In [7]:
# HashingVectorizer
h_vectorizer = HashingVectorizer(ngram_range=(1, 2), stop_words='english',preprocessor=preprocess_text)

In [8]:
# TfidfVectorizer
t_vectorizer = TfidfVectorizer(min_df=20,ngram_range=(1, 2),stop_words='english',preprocessor=preprocess_text)

Split the dataset

In [9]:
c_x = c_vectorizer.fit_transform(df['Text'])
h_x = h_vectorizer.fit_transform(df['Text'])
t_x = t_vectorizer.fit_transform(df['Text'])
y = df['Sentiment']

In [10]:
c_x_train, c_x_test, y_train, y_test = train_test_split(c_x, y, test_size=0.2,random_state=0)
h_x_train, h_x_test, y_train, y_test = train_test_split(h_x, y, test_size=0.2,random_state=0)
t_x_train, t_x_test, y_train, y_test = train_test_split(t_x, y, test_size=0.2,random_state=0)


train the model

In [13]:
def model_training(vect_method,x_train,x_test,y_train,y_test):

    print('----------------------')
    print('----------------------')
    print(vect_method)
    print('----------------------')


    # 1.logistic regression
    print('----------------------')
    print('Logistic regression : ')
    LR_params = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
    };
    LR_grid_search = GridSearchCV(LogisticRegression(max_iter=1000), LR_params, cv=5, scoring='accuracy')
    LR_grid_search.fit(x_train, y_train)
    LR_best_params = LR_grid_search.best_params_
    print(LR_best_params)
    # Retrieve the best cross-validated accuracy score achieved with the best hyperparameters
    print(f'Accuracy train: {LR_grid_search.best_score_}')
    print(f'Accuracy test: {accuracy_score(y_test,LR_grid_search.best_estimator_.predict(x_test))}')

    # 2.KNN
    print('----------------------')
    print('KNN : ')
    KNN_params = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance']
    }
    KNN_grid_search = GridSearchCV(KNeighborsClassifier(), KNN_params, cv=5, scoring='accuracy')
    KNN_grid_search.fit(x_train, y_train)
    KNN_best_params = KNN_grid_search.best_params_
    print(KNN_best_params)
    # Retrieve the best cross-validated accuracy score achieved with the best hyperparameters
    print(f'Accuracy train: {KNN_grid_search.best_score_}')
    print(f'Accuracy test: {accuracy_score(y_test,KNN_grid_search.best_estimator_.predict(x_test))}')

    # 3.RF
    print('----------------------')
    print('Random foreest : ')
    RF_params = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
    }
    RF_grid_search = GridSearchCV(RandomForestClassifier(random_state=0), RF_params, cv=5, scoring='accuracy')
    RF_grid_search.fit(x_train, y_train)
    RF_best_params = RF_grid_search.best_params_
    print(RF_best_params)
    print(f'Accuracy train: {RF_grid_search.best_score_}')
    print(f'Accuracy test: {accuracy_score(y_test,RF_grid_search.best_estimator_.predict(x_test))}')

    # 4.XGBoost
    print('----------------------')
    print('XGBoost : ')
    XGBoost_params = {
      'learning_rate': [0.01, 0.1, 0.2],
      'n_estimators': [100, 200],
      'max_depth': [3, 4, 5]
    }
    XGBoost_grid_search = GridSearchCV(XGBClassifier(), XGBoost_params, cv=5, scoring='accuracy')
    XGBoost_grid_search.fit(x_train, y_train)
    XGBoost_best_params = XGBoost_grid_search.best_params_
    print(XGBoost_best_params)
    print(f'Accuracy train: {XGBoost_grid_search.best_score_}')
    print(f'Accuracy test: {accuracy_score(y_test,XGBoost_grid_search.best_estimator_.predict(x_test))}')

    # 5.SVM
    print('----------------------')
    print('SVM : ')

    SVM_params = {
      'C': [0.1, 1, 10],
      'kernel': ['linear', 'poly', 'rbf'],
    }
    SVM_grid_search = GridSearchCV(SVC(), SVM_params, cv=5, scoring='accuracy')
    SVM_grid_search.fit(x_train, y_train)
    SVM_best_params = SVM_grid_search.best_params_
    print(SVM_best_params)
    print(f'Accuracy train: {SVM_grid_search.best_score_}')
    print(f'Accuracy test: {accuracy_score(y_test,SVM_grid_search.best_estimator_.predict(x_test))}')

    #6 Naïve Bayes models
    print('----------------------')
    print('Naïve Bayes : ')
    NB_param = {'alpha': [0.1, 0.5, 1],'force_alpha': [True,False]}
    NB_grid_search = GridSearchCV(BernoulliNB(), NB_param, cv=5, scoring='accuracy')
    NB_grid_search.fit(x_train, y_train)
    print(NB_grid_search.best_params_)
    print(f'Accuracy train: {NB_grid_search.best_score_}')
    print(f'Accuracy test: {accuracy_score(y_test,NB_grid_search.best_estimator_.predict(x_test))}')

In [15]:
model_training('CountVectorizer',c_x_train, c_x_test, y_train, y_test)

----------------------
----------------------
CountVectorizer
----------------------
----------------------
Logistic regression : 
{'C': 1}
Accuracy train: 0.950869432306558
Accuracy test: 0.937799043062201
----------------------
KNN : 
{'n_neighbors': 7, 'weights': 'distance'}
Accuracy train: 0.9311035586484688
Accuracy test: 0.9593301435406698
----------------------
Random foreest : 
{'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Accuracy train: 0.9562604520688354
Accuracy test: 0.9521531100478469
----------------------
XGBoost : 
{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}
Accuracy train: 0.9556670442897989
Accuracy test: 0.9593301435406698
----------------------
SVM : 
{'C': 10, 'kernel': 'rbf'}
Accuracy train: 0.953863444282606
Accuracy test: 0.9593301435406698
----------------------
Naïve Bayes : 
{'alpha': 1, 'force_alpha': True}
Accuracy train: 0.9490694287101473
Accuracy test: 0.9497607655502392


In [14]:
model_training('HashingVectorizer',h_x_train, h_x_test, y_train, y_test)

----------------------
----------------------
HashingVectorizer
----------------------
----------------------
Logistic regression : 
{'C': 10}
Accuracy train: 0.9670514826203448
Accuracy test: 0.9808612440191388
----------------------
KNN : 
{'n_neighbors': 9, 'weights': 'distance'}
Accuracy train: 0.9430903957850065
Accuracy test: 0.9641148325358851
----------------------
Random foreest : 
{'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy train: 0.9071424718131306
Accuracy test: 0.9210526315789473
----------------------
XGBoost : 
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Accuracy train: 0.9610544676412939
Accuracy test: 0.9760765550239234
----------------------
SVM : 
{'C': 10, 'kernel': 'linear'}
Accuracy train: 0.9676520832209455
Accuracy test: 0.9856459330143541
----------------------
Naïve Bayes : 
{'alpha': 0.1, 'force_alpha': True}
Accuracy train: 0.6848393303483122
Accuracy test: 0.6770334928229665


In [17]:
model_training('TfidfVectorizer',t_x_train, t_x_test, y_train, y_test)

----------------------
----------------------
TfidfVectorizer
----------------------
----------------------
Logistic regression : 
{'C': 10}
Accuracy train: 0.953867040693388
Accuracy test: 0.9617224880382775
----------------------
KNN : 
{'n_neighbors': 7, 'weights': 'distance'}
Accuracy train: 0.9089262915610222
Accuracy test: 0.9425837320574163
----------------------
Random foreest : 
{'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Accuracy train: 0.9562622502742263
Accuracy test: 0.9569377990430622
----------------------
XGBoost : 
{'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}
Accuracy train: 0.9574652496808185
Accuracy test: 0.9617224880382775
----------------------
SVM : 
{'C': 1, 'kernel': 'rbf'}
Accuracy train: 0.9526640412867957
Accuracy test: 0.9688995215311005
----------------------
Naïve Bayes : 
{'alpha': 1, 'force_alpha': True}
Accuracy train: 0.9490694287101473
Accuracy test: 0.9497607655502392
