# Twitter Sentiment Analysis using Random Forest Classifier

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('twitter_training.csv')
df

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [3]:
df.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [4]:
df_new=df.dropna(axis=0)

In [5]:
df_new

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [6]:
new_df=df_new.drop(['2401','Borderlands'],axis=1)

In [7]:
new_df.rename(columns={'Positive':'Sentiment','im getting on borderlands and i will murder you all ,':'Text'},inplace=True)

In [8]:
new_df

Unnamed: 0,Sentiment,Text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
74676,Positive,Just realized that the Windows partition of my...
74677,Positive,Just realized that my Mac window partition is ...
74678,Positive,Just realized the windows partition of my Mac ...
74679,Positive,Just realized between the windows partition of...


In [9]:
new_df.head()

Unnamed: 0,Sentiment,Text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...


In [10]:
new_df.isnull().sum()

Sentiment    0
Text         0
dtype: int64

In [11]:
new_df.duplicated().sum()

4227

In [12]:
new_df.drop_duplicates()

Unnamed: 0,Sentiment,Text
0,Positive,I am coming to the borders and I will kill you...
1,Positive,im getting on borderlands and i will kill you ...
2,Positive,im coming on borderlands and i will murder you...
3,Positive,im getting on borderlands 2 and i will murder ...
4,Positive,im getting into borderlands and i can murder y...
...,...,...
74676,Positive,Just realized that the Windows partition of my...
74677,Positive,Just realized that my Mac window partition is ...
74678,Positive,Just realized the windows partition of my Mac ...
74679,Positive,Just realized between the windows partition of...


In [13]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [14]:
def clean_text(text):
    text = re.sub(r"[^\w\s]",'',text)
    text = re.sub(r"https\S+",'',text)
    text = re.sub(r"\s+",' ',text).strip()
    return text.lower()

In [15]:
new_df['Text']=new_df['Text'].apply(clean_text)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [17]:
tf = TfidfVectorizer(max_features=10000,stop_words="english")
X = tf.fit_transform(new_df['Text']).toarray()
encoder = LabelEncoder()
y = encoder.fit_transform(new_df['Sentiment'])

In [18]:
X_arr = np.array(X)
y_arr = np.array(y)

In [19]:
X_train,X_test,y_train,y_test = train_test_split(X_arr,y_arr,test_size=0.2,random_state=0)

# Model Building 

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
model = RandomForestClassifier(criterion='entropy')
model.fit(X_train,y_train)

In [24]:
y_pred=model.predict(X_test)

In [25]:
y_pred

array([1, 1, 2, ..., 3, 1, 0])

In [23]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [26]:
accuracy1=accuracy_score(y_test,y_pred)

In [27]:
accuracy1

0.8899925670653422

In [28]:
classification1=classification_report(y_test,y_pred)

In [29]:
print(classification1)

              precision    recall  f1-score   support

           0       0.93      0.82      0.87      2530
           1       0.92      0.92      0.92      4459
           2       0.91      0.87      0.89      3635
           3       0.83      0.93      0.88      4175

    accuracy                           0.89     14799
   macro avg       0.90      0.88      0.89     14799
weighted avg       0.89      0.89      0.89     14799



In [30]:
confusion1=confusion_matrix(y_test,y_pred)
confusion1

array([[2063,  102,  103,  262],
       [  38, 4084,  105,  232],
       [  51,  128, 3157,  299],
       [  58,  127,  123, 3867]], dtype=int64)