<a href="https://colab.research.google.com/github/RahulDevjani/PRODIGY_DS_04/blob/main/Model_Building.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
# Load data-preprocessing libraries
import pandas as pd
import numpy as np

# Text processing libraries
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Load data-visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Model building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Evaluation metrics
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Set Pandas option to display all columns
pd.set_option('display.max_columns', None)

# Set Pandas option to display all rows
pd.set_option('display.max_rows', None)

# Set Pandas option to display all values
pd.set_option('display.max_colwidth', None)


In [38]:
#Read the dataset
df=pd.read_csv('Twitter_cleaned.csv',index_col='Unnamed: 0')
df.head()

Unnamed: 0,Sentiment,Tweet_word_count,Tweet_char_count,Tweet_clean,ApexLegends,AssassinsCreed,Battlefield,Borderlands,CS-GO,CallOfDuty,CallOfDutyBlackopsColdWar,Cyberpunk2077,Dota2,FIFA,Facebook,Fortnite,Google,GrandTheftAuto(GTA),Hearthstone,HomeDepot,LeagueOfLegends,MaddenNFL,Microsoft,NBA2K,Nvidia,Overwatch,PlayStation5(PS5),PlayerUnknownsBattlegrounds(PUBG),RedDeadRedemption(RDR),TomClancysGhostRecon,TomClancysRainbowSix,Verizon,WorldOfCraft,Xbox(Xseries),johnson&johnson
0,1,11,43,im getting borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,12,40,coming border kill,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,10,41,im getting borderland kill,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,10,42,im coming borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,12,46,im getting borderland murder,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [39]:
df.shape

(73996, 35)

In [None]:
#check for null values
(df.isnull().sum()/len(df))*100

Sentiment                            0.000000
Tweet_word_count                     0.000000
Tweet_char_count                     0.000000
Tweet_clean                          2.310936
ApexLegends                          0.000000
AssassinsCreed                       0.000000
Battlefield                          0.000000
Borderlands                          0.000000
CS-GO                                0.000000
CallOfDuty                           0.000000
CallOfDutyBlackopsColdWar            0.000000
Cyberpunk2077                        0.000000
Dota2                                0.000000
FIFA                                 0.000000
Facebook                             0.000000
Fortnite                             0.000000
Google                               0.000000
GrandTheftAuto(GTA)                  0.000000
Hearthstone                          0.000000
HomeDepot                            0.000000
LeagueOfLegends                      0.000000
MaddenNFL                         

In [None]:
df.dropna(axis=0,how='any',inplace=True)

In [None]:
df.isnull().sum()

Sentiment                            0
Tweet_word_count                     0
Tweet_char_count                     0
Tweet_clean                          0
ApexLegends                          0
AssassinsCreed                       0
Battlefield                          0
Borderlands                          0
CS-GO                                0
CallOfDuty                           0
CallOfDutyBlackopsColdWar            0
Cyberpunk2077                        0
Dota2                                0
FIFA                                 0
Facebook                             0
Fortnite                             0
Google                               0
GrandTheftAuto(GTA)                  0
Hearthstone                          0
HomeDepot                            0
LeagueOfLegends                      0
MaddenNFL                            0
Microsoft                            0
NBA2K                                0
Nvidia                               0
Overwatch                

In [None]:
#Seperate dependent and independent features
X=df.loc[:,df.columns!='Sentiment']
y=df['Sentiment']

In [None]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=0)
# summarize
print('Train', X_train.shape, y_train.shape)
print('Test', X_valid.shape, y_valid.shape)

Train (50600, 34) (50600,)
Test (21686, 34) (21686,)


In [None]:
vectorizer = TfidfVectorizer(stop_words='english')
features_train= vectorizer.fit_transform(X_train['Tweet_clean'])
features_valid= vectorizer.transform(X_valid['Tweet_clean'])
features_train.shape, features_valid.shape

((50600, 30673), (21686, 30673))

In [None]:
#Function to fit and apply a model
def model_apply(model):
    #train the model
    model.fit(features_train,y_train)
    #make predictions
    pred=model.predict(features_valid)
    #model evaluation
    print(model)
    print('Accuracy score: ',accuracy_score(pred,y_valid))
    print('Weighted F1 score: ',f1_score(y_pred=pred,y_true=y_valid,average='weighted'))
    print('Confusion Matrix: \n',confusion_matrix(pred,y_valid))

In [None]:
#Multinomial Naive Bayes
nb=MultinomialNB()
model_apply(nb)

MultinomialNB()
Accuracy score:  0.7666236281471918
Weighted F1 score:  0.7644683478536343
Confusion Matrix: 
 [[4974  779  561]
 [1319 7782 1648]
 [ 250  504 3869]]


In [None]:
#Logistic Regression
lr=LogisticRegression(random_state=10,max_iter=500)
model_apply(lr)

LogisticRegression(max_iter=500, random_state=10)
Accuracy score:  0.7885732730794061
Weighted F1 score:  0.7880963850386071
Confusion Matrix: 
 [[5116  766  452]
 [1048 7544 1185]
 [ 379  755 4441]]


In [None]:
#Decision Tree
dtc=DecisionTreeClassifier(random_state=10)
model_apply(dtc)

DecisionTreeClassifier(random_state=10)
Accuracy score:  0.8060499861661902
Weighted F1 score:  0.8059525946403627
Confusion Matrix: 
 [[5330  824  446]
 [ 844 7426  908]
 [ 369  815 4724]]


In [None]:
#Random Forest
rf=RandomForestClassifier(random_state=101)
model_apply(rf)

RandomForestClassifier(random_state=101)
Accuracy score:  0.9060684312459651
Weighted F1 score:  0.906005151471547
Confusion Matrix: 
 [[5915  310  183]
 [ 479 8414  575]
 [ 149  341 5320]]
