# Tweet Sentiment Classifier - NLP Problem:

In [1]:
## Importing required libraries....
import pandas as pd
import numpy as np
from sklearn import metrics
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [2]:
## reading train data....
data = pd.read_csv('train_2kmZucJ.csv')
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


### About Features:
- Label : '1' - Negative Tweet and '0' - Positive Tweet.
- Tweet : Tweet posted by social media users.

In [3]:
test_data = pd.read_csv('test_oJQbWVk.csv')
test_data.head()

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...


In [4]:
## Creating a feature with random value in Test Data....
test_data['label'] = 999999

In [5]:
print("Dimensions of Train data is :" , data.shape , "\nDimensions of Test data is :" , test_data.shape)

Dimensions of Train data is : (7920, 3) 
Dimensions of Test data is : (1953, 3)


In [6]:
## appending test data to train data for preprocessing.....
df = data.append(test_data)
df.reset_index(inplace = True , drop = True)

In [7]:
print("Dimensions of Combined data 'df' is :" , df.shape)

Dimensions of Combined data 'df' is : (9873, 3)


In [8]:
## Removing links and converting all tweets to lower case.....
for i in range(df.shape[0]):
    sent = df['tweet'][i]
    sent = sent.lower()
    sent = sent.split()
    char_list_to_remove = ['http']
    sent = [word for word in sent if all(ch not in word for ch in char_list_to_remove)]
    sent = " ".join(sent)
    sent = re.sub('[^a-zA-Z]' , ' ' , sent)
    df['tweet'][i] = sent

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [9]:
## Removing all single alphabets from all the tweets.....
for i in range(df.shape[0]):
    sent = df['tweet'][i]
    sent = sent.split()
    char_list_to_remove = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 
                           'u', 'v', 'w', 'x', 'y', 'z']
    sent = [word for word in sent if word not in char_list_to_remove]
    sent = " ".join(sent)
    df['tweet'][i] = sent

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [10]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps beauti...
1,2,0,finally transparant silicon case thanks to my ...
2,3,0,we love this would you go talk makememories un...
3,4,0,wired know george was made that way iphone cut...
4,5,1,what amazing service apple won even talk to me...


In [11]:
## Removing all Stopwords from the Tweets and applying WORD LEMMATIZATION...
lem = WordNetLemmatizer()

for i in range(df.shape[0]):
    sent = df['tweet'][i]
    sent = sent.split()
    sent = [lem.lemmatize(word) for word in sent if word not in set(stopwords.words('english'))]
    sent = " ".join(sent)
    df['tweet'][i] = sent

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [12]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,fingerprint pregnancy test android apps beauti...
1,2,0,finally transparant silicon case thanks uncle ...
2,3,0,love would go talk makememories unplug relax i...
3,4,0,wired know george made way iphone cute daventr...
4,5,1,amazing service apple even talk question unles...


In [13]:
## Applying Train Test Split.....
training = df.iloc[0:data.shape[0] , :]
test_submission = df.iloc[data.shape[0]: , :]

In [14]:
## Apply vectorization using TF-IDF....
from sklearn.feature_extraction.text import TfidfVectorizer

Tfidf = TfidfVectorizer()
Tfidf.fit(training['tweet'])
X = Tfidf.transform(training['tweet'])

In [15]:
## train-test split for model training and validation....
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(X, training['label'], test_size = 0.2, random_state=10 )

## Model :

#### Naive Bayes

In [16]:
from sklearn.naive_bayes import MultinomialNB

NV = MultinomialNB()
NV.fit(x_train , y_train)
print("Train accuracy score:" , NV.score(x_train , y_train)*100)

Train accuracy score: 91.28787878787878


In [17]:
print("Test accuracy score:" , NV.score(x_test , y_test)*100)

Test accuracy score: 84.02777777777779


In [18]:
print("Test Data - Confusion Matrix:\n")
print(pd.DataFrame(metrics.confusion_matrix(y_test , NV.predict(x_test))))

Test Data - Confusion Matrix:

      0    1
0  1129   33
1   220  202


#### Logistic Regression as Classifier:

In [19]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(x_train , y_train)
print("Test accuracy score:" , log_reg.score(x_train , y_train)*100)

Test accuracy score: 92.17171717171718


In [20]:
print("Test accuracy score:" , log_reg.score(x_test , y_test)*100)

Test accuracy score: 86.61616161616162


In [21]:
print("Test Data - Confusion Matrix:\n")
print(pd.DataFrame(metrics.confusion_matrix(y_test , log_reg.predict(x_test))))
print(metrics.classification_report(y_test , log_reg.predict(x_test)))

Test Data - Confusion Matrix:

      0    1
0  1113   49
1   163  259
              precision    recall  f1-score   support

           0       0.87      0.96      0.91      1162
           1       0.84      0.61      0.71       422

    accuracy                           0.87      1584
   macro avg       0.86      0.79      0.81      1584
weighted avg       0.86      0.87      0.86      1584



#### Random Forest Classifier:

In [22]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier()
rf_clf.fit(x_train , y_train)
print("Test accuracy score:" , rf_clf.score(x_train , y_train)*100)

Test accuracy score: 99.96843434343434


In [23]:
print("Test accuracy score:" , rf_clf.score(x_test , y_test)*100)

Test accuracy score: 85.03787878787878


In [24]:
print("Test Data - Confusion Matrix:\n")
print(pd.DataFrame(metrics.confusion_matrix(y_test , rf_clf.predict(x_test))))
print(metrics.classification_report(y_test , rf_clf.predict(x_test)))

Test Data - Confusion Matrix:

      0    1
0  1100   62
1   175  247
              precision    recall  f1-score   support

           0       0.86      0.95      0.90      1162
           1       0.80      0.59      0.68       422

    accuracy                           0.85      1584
   macro avg       0.83      0.77      0.79      1584
weighted avg       0.85      0.85      0.84      1584



#### Gradient Boosting Classifier:


In [25]:
from sklearn.ensemble import GradientBoostingClassifier

gb_clf = GradientBoostingClassifier(learning_rate=0.05, n_estimators=200)
gb_clf.fit(x_train , y_train)
print("Test accuracy score:" , gb_clf.score(x_train , y_train)*100)

Test accuracy score: 86.58459595959596


In [26]:
print("Test accuracy score:" , gb_clf.score(x_test , y_test)*100)

Test accuracy score: 82.57575757575758


In [27]:
print("Test Data - Confusion Matrix:\n")
print(pd.DataFrame(metrics.confusion_matrix(y_test , gb_clf.predict(x_test))))
print(metrics.classification_report(y_test , gb_clf.predict(x_test)))

Test Data - Confusion Matrix:

      0    1
0  1118   44
1   232  190
              precision    recall  f1-score   support

           0       0.83      0.96      0.89      1162
           1       0.81      0.45      0.58       422

    accuracy                           0.83      1584
   macro avg       0.82      0.71      0.73      1584
weighted avg       0.82      0.83      0.81      1584



#### XGBOOST Classifier:


In [28]:
from xgboost import XGBClassifier

xg_clf = XGBClassifier(n_estimators = 400)
xg_clf.fit(x_train , y_train)
print("\nTest accuracy score:" , xg_clf.score(x_train , y_train)*100)




Test accuracy score: 98.48484848484848


In [29]:
print("Test accuracy score:" , xg_clf.score(x_test , y_test)*100)

Test accuracy score: 86.9949494949495


In [30]:
print("Test Data - Confusion Matrix:\n")
print(pd.DataFrame(metrics.confusion_matrix(y_test , xg_clf.predict(x_test))))

Test Data - Confusion Matrix:

      0    1
0  1068   94
1   112  310


In [31]:
print(metrics.classification_report(y_test , xg_clf.predict(x_test)))

              precision    recall  f1-score   support

           0       0.91      0.92      0.91      1162
           1       0.77      0.73      0.75       422

    accuracy                           0.87      1584
   macro avg       0.84      0.83      0.83      1584
weighted avg       0.87      0.87      0.87      1584



## Preprocessing Submission Test data:

In [32]:
## Applying vectorization....
test_input = Tfidf.transform(test_submission['tweet'])

In [33]:
submission = pd.read_csv('sample_submission_LnhVWA4.csv')
submission.head()

Unnamed: 0,id,label
0,7921,0
1,7922,0
2,7923,0
3,7924,0
4,7925,0


In [34]:
results = log_reg.predict(test_input)
submission['label'] = results

In [35]:
submission.to_csv("Case_Submission.csv" , index = False)

## Things to work on:
- This Models can further be improved by the use of proper sampling method to balance the data.
- We can also make use of better **WORD EMBEDDING Technique** like **WORD2VEC** and apply **ANN** for better results.