## Perform imports and load the dataset

In [2]:
import numpy as np
import pandas as pd
#!pip install nltk
#import nltk
#nltk.download()

In [3]:
import re # for regular expressions
pd.set_option("display.max_colwidth", 200)
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk # for text manipulation
import warnings 
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

In [134]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\sibusiso\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
df_train = pd.read_csv('https://raw.githubusercontent.com/SibusisoTL/classification-predict/master/train.csv')
df_test = pd.read_csv('https://raw.githubusercontent.com/SibusisoTL/classification-predict/master/test.csv')

In [5]:
sample_submission = pd.read_csv('https://raw.githubusercontent.com/SibusisoTL/classification-predict/master/sample_submission.csv')

In [6]:
sample_submission.head()

Unnamed: 0,tweetid,sentiment
0,169760,1
1,35326,1
2,224985,1
3,476263,1
4,872928,1


In [7]:
df_train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable",625221
1,1,It's not like we lack evidence of anthropogenic global warming,126103
2,2,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954


In [8]:
df_test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make sure that it is not alone in fighting climate change… https://t.co/O7T8rCgwDq,169760
1,Combine this with the polling of staffers re climate change and womens' rights and you have a fascist state. https://t.co/ifrm7eexpj,35326
2,"The scary, unimpeachable evidence that climate change is already here: https://t.co/yAedqcV9Ki #itstimetochange #climatechange @ZEROCO2_;..",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPutin got to you too Jill ! \nTrump doesn't believe in climate change at all \nThinks it's s hoax,476263
4,RT @FakeWillMoore: 'Female orgasms cause global warming!'\n-Sarcastic Republican,872928


In [9]:
print(len(df_train))

15819


In [10]:
print(len(df_test))

10546


In [11]:
df_train.isnull().sum()

sentiment    0
message      0
tweetid      0
dtype: int64

In [12]:
df_test.isnull().sum()

message    0
tweetid    0
dtype: int64

In [13]:
df_train['sentiment'].value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [14]:
# Check for whitespace strings (it's OK if there aren't any!):
blanks = []  # start with an empty list

for i,sntm,msg,twts in df_train.itertuples():  # iterate over the DataFrame
    if type(msg)==str:            # avoid NaN values
        if msg.isspace():         # test 'message' for whitespace
            blanks.append(i)     # add matching index numbers to the list
        
len(blanks)

0

In [15]:
df = df_train.copy()

In [105]:
#Split the train data into train and validation 

from sklearn.model_selection import train_test_split

X = df["message"]
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

## Build a pipeline to vectorize the date, then train and fit a model
Use LinearSVC..

In [106]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train)  

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [107]:
# Form a prediction set
predictions = text_clf.predict(X_test)

In [108]:
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[ 100   28   70    8]
 [  10  135  146   22]
 [  21   45 1155  104]
 [   5    6   78  440]]


In [109]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

          -1       0.74      0.49      0.58       206
           0       0.63      0.43      0.51       313
           1       0.80      0.87      0.83      1325
           2       0.77      0.83      0.80       529

    accuracy                           0.77      2373
   macro avg       0.73      0.66      0.68      2373
weighted avg       0.76      0.77      0.76      2373



In [110]:
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.7711757269279393


## Cleaning dataframe
1. Add punctuation column
2. Add text lenght
3. Add polarity score (Compound score)

In [148]:
dataframe = df_train.copy()

In [149]:
#using regular expressions to remove tagy symbols
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt

In [150]:
dataframe['tidy_tweet'] = np.vectorize(remove_pattern)(dataframe['message'], "@[\w]*") 
dataframe.head()

Unnamed: 0,sentiment,message,tweetid,tidy_tweet
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable",625221,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via"
1,1,It's not like we lack evidence of anthropogenic global warming,126103,It's not like we lack evidence of anthropogenic global warming
2,2,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…,698562,RT : Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD,573736,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954,"RT : It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight"


In [152]:
#removing spacial charectors using regular expressions
dataframe['tidy_tweet'] = dataframe['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")
dataframe.head()

Unnamed: 0,sentiment,message,tweetid,tidy_tweet
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable",625221,PolySciMajor EPA chief doesn t think carbon dioxide is main cause of global warming and wait what https t co yeLvcEFXkC via
1,1,It's not like we lack evidence of anthropogenic global warming,126103,It s not like we lack evidence of anthropogenic global warming
2,2,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…,698562,RT Researchers say we have three years to act on climate change before it s too late https t co WdT KdUr f https t co Z ANPT
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD,573736,#TodayinMaker# WIRED was a pivotal year in the war on climate change https t co wOTxTLcD
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954,RT It s and a racist sexist climate change denying bigot is leading in the polls #ElectionNight


In [162]:
dataframe['lenght'] = dataframe['message'].str.split().str.len()
dataframe.head()

Unnamed: 0,sentiment,message,tweetid,tidy_tweet,lenght,count_punct,scores,compound
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable",625221,PolySciMajor EPA chief doesn t think carbon dioxide is main cause of global warming and wait what https t co yeLvcEFXkC via,19,0,"{'neg': 0.0, 'neu': 0.922, 'pos': 0.078, 'compound': 0.1531}",0.1531
1,1,It's not like we lack evidence of anthropogenic global warming,126103,It s not like we lack evidence of anthropogenic global warming,10,0,"{'neg': 0.167, 'neu': 0.552, 'pos': 0.281, 'compound': 0.1159}",0.1159
2,2,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…,698562,RT Researchers say we have three years to act on climate change before it s too late https t co WdT KdUr f https t co Z ANPT,19,0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD,573736,#TodayinMaker# WIRED was a pivotal year in the war on climate change https t co wOTxTLcD,15,2,"{'neg': 0.231, 'neu': 0.769, 'pos': 0.0, 'compound': -0.5994}",-0.5994
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954,RT It s and a racist sexist climate change denying bigot is leading in the polls #ElectionNight,18,1,"{'neg': 0.33, 'neu': 0.67, 'pos': 0.0, 'compound': -0.7506}",-0.7506


In [163]:
tidy_tweet = lambda l1,l2: sum([1 for x in l1 if x in l2])

dataframe['count_punct'] = dataframe.message.apply(lambda s: count(s, string.punctuation))
dataframe.head()

Unnamed: 0,sentiment,message,tweetid,tidy_tweet,lenght,count_punct,scores,compound
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable",625221,PolySciMajor EPA chief doesn t think carbon dioxide is main cause of global warming and wait what https t co yeLvcEFXkC via,19,12,"{'neg': 0.0, 'neu': 0.922, 'pos': 0.078, 'compound': 0.1531}",0.1531
1,1,It's not like we lack evidence of anthropogenic global warming,126103,It s not like we lack evidence of anthropogenic global warming,10,1,"{'neg': 0.167, 'neu': 0.552, 'pos': 0.281, 'compound': 0.1159}",0.1159
2,2,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…,698562,RT Researchers say we have three years to act on climate change before it s too late https t co WdT KdUr f https t co Z ANPT,19,12,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD,573736,#TodayinMaker# WIRED was a pivotal year in the war on climate change https t co wOTxTLcD,15,8,"{'neg': 0.231, 'neu': 0.769, 'pos': 0.0, 'compound': -0.5994}",-0.5994
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954,RT It s and a racist sexist climate change denying bigot is leading in the polls #ElectionNight,18,8,"{'neg': 0.33, 'neu': 0.67, 'pos': 0.0, 'compound': -0.7506}",-0.7506


In [164]:
#VADER's SentimentIntensityAnalyzer() takes in a string and returns a dictionary of scores in each of four categories:
#negative, neutral, positive, compound (computed by normalizing the scores above

dataframe['scores'] = dataframe['message'].apply(lambda message: sid.polarity_scores(message))
dataframe.head()

Unnamed: 0,sentiment,message,tweetid,tidy_tweet,lenght,count_punct,scores,compound
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable",625221,PolySciMajor EPA chief doesn t think carbon dioxide is main cause of global warming and wait what https t co yeLvcEFXkC via,19,12,"{'neg': 0.0, 'neu': 0.905, 'pos': 0.095, 'compound': 0.2244}",0.1531
1,1,It's not like we lack evidence of anthropogenic global warming,126103,It s not like we lack evidence of anthropogenic global warming,10,1,"{'neg': 0.167, 'neu': 0.552, 'pos': 0.281, 'compound': 0.1159}",0.1159
2,2,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…,698562,RT Researchers say we have three years to act on climate change before it s too late https t co WdT KdUr f https t co Z ANPT,19,12,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD,573736,#TodayinMaker# WIRED was a pivotal year in the war on climate change https t co wOTxTLcD,15,8,"{'neg': 0.245, 'neu': 0.755, 'pos': 0.0, 'compound': -0.5994}",-0.5994
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954,RT It s and a racist sexist climate change denying bigot is leading in the polls #ElectionNight,18,8,"{'neg': 0.299, 'neu': 0.701, 'pos': 0.0, 'compound': -0.7506}",-0.7506


In [165]:
dataframe['compound']  = dataframe['scores'].apply(lambda score_dict: score_dict['compound'])
dataframe.head()

Unnamed: 0,sentiment,message,tweetid,tidy_tweet,lenght,count_punct,scores,compound
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable",625221,PolySciMajor EPA chief doesn t think carbon dioxide is main cause of global warming and wait what https t co yeLvcEFXkC via,19,12,"{'neg': 0.0, 'neu': 0.905, 'pos': 0.095, 'compound': 0.2244}",0.2244
1,1,It's not like we lack evidence of anthropogenic global warming,126103,It s not like we lack evidence of anthropogenic global warming,10,1,"{'neg': 0.167, 'neu': 0.552, 'pos': 0.281, 'compound': 0.1159}",0.1159
2,2,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…,698562,RT Researchers say we have three years to act on climate change before it s too late https t co WdT KdUr f https t co Z ANPT,19,12,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}",0.0
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD,573736,#TodayinMaker# WIRED was a pivotal year in the war on climate change https t co wOTxTLcD,15,8,"{'neg': 0.245, 'neu': 0.755, 'pos': 0.0, 'compound': -0.5994}",-0.5994
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954,RT It s and a racist sexist climate change denying bigot is leading in the polls #ElectionNight,18,8,"{'neg': 0.299, 'neu': 0.701, 'pos': 0.0, 'compound': -0.7506}",-0.7506


## Training cleaned dataset and applying logistic regression model

In [166]:
#Split the train data into train and validation 

from sklearn.model_selection import train_test_split

X = dataframe[["lenght","count_punct", "compound"]]
y = dataframe['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [167]:
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression(solver='lbfgs')

lr_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [168]:
from sklearn import metrics

# Create a prediction set:
predictions = lr_model.predict(X_test)

# Print a confusion matrix
print(metrics.confusion_matrix(y_test,predictions))

[[   0    4  177   25]
 [   0   46  219   48]
 [   0   14 1174  137]
 [   0    3  353  173]]


In [170]:
# You can make the confusion matrix less confusing by adding labels:
df = pd.DataFrame(metrics.confusion_matrix(y_test,predictions))
df

Unnamed: 0,0,1,2,3
0,0,4,177,25
1,0,46,219,48
2,0,14,1174,137
3,0,3,353,173


In [171]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       206
           0       0.69      0.15      0.24       313
           1       0.61      0.89      0.72      1325
           2       0.45      0.33      0.38       529

    accuracy                           0.59      2373
   macro avg       0.44      0.34      0.34      2373
weighted avg       0.53      0.59      0.52      2373



  _warn_prf(average, modifier, msg_start, len(result))


In [172]:
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.5870206489675516


## Combine Steps with TfidVectorizer

In [111]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train) # remember to use the original X_train set
X_train_tfidf.shape

(13446, 28380)

## Train a Classifier
Here we'll introduce an SVM classifier that's similar to SVC, called LinearSVC. LinearSVC handles sparse input better, and scales well to large numbers of samples.

In [112]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train)

LinearSVC()

## Build a Pipeline

In [113]:
from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train) 

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

## Test the classifier and display results

In [114]:
# Form a prediction set
predictions = text_clf.predict(X_test)

In [115]:
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[ 100   28   70    8]
 [  10  135  146   22]
 [  21   45 1155  104]
 [   5    6   78  440]]


In [116]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

          -1       0.74      0.49      0.58       206
           0       0.63      0.43      0.51       313
           1       0.80      0.87      0.83      1325
           2       0.77      0.83      0.80       529

    accuracy                           0.77      2373
   macro avg       0.73      0.66      0.68      2373
weighted avg       0.76      0.77      0.76      2373



In [117]:
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.7711757269279393


## Create csv to upload to Kaggle

In [118]:
#Make predictions using the features from the test data set
predictions = text_clf.predict(df_test['message'])
#predictions = clf.predict(df_test[])

#Display our predictions - they are either 0 or 1 for each training instance 
#depending on whether our algorithm believes the person survived or not.
predictions

array([1, 1, 1, ..., 1, 0, 1], dtype=int64)

In [120]:
#Create a  DataFrame with the tweetid and our prediction regarding whether it good or bad
submission3 = pd.DataFrame({'tweetid':df_test['tweetid'],'sentiment':predictions})

#Visualize the first 5 rows
submission3.head()

Unnamed: 0,tweetid,sentiment
0,169760,1
1,35326,1
2,224985,1
3,476263,1
4,872928,0


In [122]:
#This is saved in the same directory as your notebook
filename = 'submission3.csv'

submission3.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: submission3.csv


In [209]:
#Split the train data into train and validation 

from sklearn.model_selection import train_test_split

X = dataframe["message"]
y = dataframe['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [210]:
text_clf_lsvc = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

In [211]:
text_clf_lsvc.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [212]:
# Form a prediction set
predictions = text_clf_lsvc.predict(X_test)

In [213]:
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[ 100   28   70    8]
 [  10  135  146   22]
 [  21   45 1155  104]
 [   5    6   78  440]]


In [214]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

          -1       0.74      0.49      0.58       206
           0       0.63      0.43      0.51       313
           1       0.80      0.87      0.83      1325
           2       0.77      0.83      0.80       529

    accuracy                           0.77      2373
   macro avg       0.73      0.66      0.68      2373
weighted avg       0.76      0.77      0.76      2373



In [215]:
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.7711757269279393


In [217]:
#Make predictions using the features from the test data set
predictions = text_clf_lsvc.predict(df_test['message'])
#predictions = clf.predict(df_test[])

#Display our predictions - they are either 0 or 1 for each training instance 
#depending on whether our algorithm believes the person survived or not.
predictions

array([1, 1, 1, ..., 1, 0, 1], dtype=int64)

In [218]:
#Create a  DataFrame with the tweetid and our prediction regarding whether it good or bad
submission4 = pd.DataFrame({'tweetid':df_test['tweetid'],'sentiment':predictions})

#Visualize the first 5 rows
submission4.head()

Unnamed: 0,tweetid,sentiment
0,169760,1
1,35326,1
2,224985,1
3,476263,1
4,872928,0


In [220]:
#This is saved in the same directory as your notebook
filename = 'submission4.csv'

submission4.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: submission4.csv


# Cleaning the data to improve the accuracy results

In [32]:
#copy the training dataset
train = df_train.copy()
test = df_test.copy()

In [33]:
#using regular expressions to remove tagy symbols
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt

In [34]:
test['tidy_tweet'] = np.vectorize(remove_pattern)(test['message'], "@[\w]*") 
test.head()

Unnamed: 0,message,tweetid,tidy_tweet
0,Europe will now be looking to China to make sure that it is not alone in fighting climate change… https://t.co/O7T8rCgwDq,169760,Europe will now be looking to China to make sure that it is not alone in fighting climate change… https://t.co/O7T8rCgwDq
1,Combine this with the polling of staffers re climate change and womens' rights and you have a fascist state. https://t.co/ifrm7eexpj,35326,Combine this with the polling of staffers re climate change and womens' rights and you have a fascist state. https://t.co/ifrm7eexpj
2,"The scary, unimpeachable evidence that climate change is already here: https://t.co/yAedqcV9Ki #itstimetochange #climatechange @ZEROCO2_;..",224985,"The scary, unimpeachable evidence that climate change is already here: https://t.co/yAedqcV9Ki #itstimetochange #climatechange ;.."
3,@Karoli @morgfair @OsborneInk @dailykos \nPutin got to you too Jill ! \nTrump doesn't believe in climate change at all \nThinks it's s hoax,476263,\nPutin got to you too Jill ! \nTrump doesn't believe in climate change at all \nThinks it's s hoax
4,RT @FakeWillMoore: 'Female orgasms cause global warming!'\n-Sarcastic Republican,872928,RT : 'Female orgasms cause global warming!'\n-Sarcastic Republican


In [35]:
#
train['tidy_tweet'] = np.vectorize(remove_pattern)(train['message'], "@[\w]*") 
train.head()

Unnamed: 0,sentiment,message,tweetid,tidy_tweet
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable",625221,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via"
1,1,It's not like we lack evidence of anthropogenic global warming,126103,It's not like we lack evidence of anthropogenic global warming
2,2,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…,698562,RT : Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD,573736,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954,"RT : It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight"


In [36]:
#removing spacial charectors using regular expressions
test['tidy_tweet'] = test['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")
test.head()

Unnamed: 0,message,tweetid,tidy_tweet
0,Europe will now be looking to China to make sure that it is not alone in fighting climate change… https://t.co/O7T8rCgwDq,169760,Europe will now be looking to China to make sure that it is not alone in fighting climate change https t co O T rCgwDq
1,Combine this with the polling of staffers re climate change and womens' rights and you have a fascist state. https://t.co/ifrm7eexpj,35326,Combine this with the polling of staffers re climate change and womens rights and you have a fascist state https t co ifrm eexpj
2,"The scary, unimpeachable evidence that climate change is already here: https://t.co/yAedqcV9Ki #itstimetochange #climatechange @ZEROCO2_;..",224985,The scary unimpeachable evidence that climate change is already here https t co yAedqcV Ki #itstimetochange #climatechange
3,@Karoli @morgfair @OsborneInk @dailykos \nPutin got to you too Jill ! \nTrump doesn't believe in climate change at all \nThinks it's s hoax,476263,Putin got to you too Jill Trump doesn t believe in climate change at all Thinks it s s hoax
4,RT @FakeWillMoore: 'Female orgasms cause global warming!'\n-Sarcastic Republican,872928,RT Female orgasms cause global warming Sarcastic Republican


In [37]:
#removing spacial charectors using regular expressions
train['tidy_tweet'] = train['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")
train.head()

Unnamed: 0,sentiment,message,tweetid,tidy_tweet
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable",625221,PolySciMajor EPA chief doesn t think carbon dioxide is main cause of global warming and wait what https t co yeLvcEFXkC via
1,1,It's not like we lack evidence of anthropogenic global warming,126103,It s not like we lack evidence of anthropogenic global warming
2,2,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…,698562,RT Researchers say we have three years to act on climate change before it s too late https t co WdT KdUr f https t co Z ANPT
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD,573736,#TodayinMaker# WIRED was a pivotal year in the war on climate change https t co wOTxTLcD
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954,RT It s and a racist sexist climate change denying bigot is leading in the polls #ElectionNight


In [38]:
#remove 3 letter word
test['tidy_tweet'] = test['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
test.head()

Unnamed: 0,message,tweetid,tidy_tweet
0,Europe will now be looking to China to make sure that it is not alone in fighting climate change… https://t.co/O7T8rCgwDq,169760,Europe will looking China make sure that alone fighting climate change https rCgwDq
1,Combine this with the polling of staffers re climate change and womens' rights and you have a fascist state. https://t.co/ifrm7eexpj,35326,Combine this with polling staffers climate change womens rights have fascist state https ifrm eexpj
2,"The scary, unimpeachable evidence that climate change is already here: https://t.co/yAedqcV9Ki #itstimetochange #climatechange @ZEROCO2_;..",224985,scary unimpeachable evidence that climate change already here https yAedqcV #itstimetochange #climatechange
3,@Karoli @morgfair @OsborneInk @dailykos \nPutin got to you too Jill ! \nTrump doesn't believe in climate change at all \nThinks it's s hoax,476263,Putin Jill Trump doesn believe climate change Thinks hoax
4,RT @FakeWillMoore: 'Female orgasms cause global warming!'\n-Sarcastic Republican,872928,Female orgasms cause global warming Sarcastic Republican


In [125]:
#remove 3 letter word
train['tidy_tweet'] = train['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
train.head()

Unnamed: 0,sentiment,message,tweetid,tidy_tweet,lenght
0,1,"PolySciMajor EPA chief doesn't think carbon dioxide is main cause of global warming and.. wait, what!? https://t.co/yeLvcEFXkC via @mashable",625221,polyscimajor chief doesn think carbon dioxid main caus global warm wait what http yelvcefxkc,14
1,1,It's not like we lack evidence of anthropogenic global warming,126103,like lack evid anthropogen global warm,6
2,2,RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…,698562,research have three year climat chang befor late http kdur http anpt,12
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year in the war on climate change https://t.co/44wOTxTLcD,573736,#todayinmaker# wire pivot year climat chang http wotxtlcd,8
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, sexist, climate change denying bigot is leading in the polls. #ElectionNight",466954,racist sexist climat chang deni bigot lead poll #electionnight,9


In [40]:
#plitting sentaces into words
test_tokenized_tweet = test['tidy_tweet'].apply(lambda x: x.split()) # tokenizing

In [41]:
#plitting sentaces into words
train_tokenized_tweet = train['tidy_tweet'].apply(lambda x: x.split()) # tokenizing

In [42]:
test_tokenized_tweet.head()

0                           [Europe, will, looking, China, make, sure, that, alone, fighting, climate, change, https, rCgwDq]
1         [Combine, this, with, polling, staffers, climate, change, womens, rights, have, fascist, state, https, ifrm, eexpj]
2    [scary, unimpeachable, evidence, that, climate, change, already, here, https, yAedqcV, #itstimetochange, #climatechange]
3                                                         [Putin, Jill, Trump, doesn, believe, climate, change, Thinks, hoax]
4                                                            [Female, orgasms, cause, global, warming, Sarcastic, Republican]
Name: tidy_tweet, dtype: object

In [43]:
train_tokenized_tweet.head()

0    [PolySciMajor, chief, doesn, think, carbon, dioxide, main, cause, global, warming, wait, what, https, yeLvcEFXkC]
1                                                               [like, lack, evidence, anthropogenic, global, warming]
2                           [Researchers, have, three, years, climate, change, before, late, https, KdUr, https, ANPT]
3                                             [#TodayinMaker#, WIRED, pivotal, year, climate, change, https, wOTxTLcD]
4                                    [racist, sexist, climate, change, denying, bigot, leading, polls, #ElectionNight]
Name: tidy_tweet, dtype: object

In [44]:
# use stemming (gives two meaning words)
from nltk.stem.porter import *
stemmer = PorterStemmer()

test_tokenized_tweet = test_tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming

In [45]:
# use stemming (gives two meaning words)
from nltk.stem.porter import *
stemmer = PorterStemmer()

train_tokenized_tweet = train_tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming

In [46]:
#change to sentances again after stemming was complete
for i in range(len(test_tokenized_tweet)):
    test_tokenized_tweet[i] = ' '.join(test_tokenized_tweet[i])
    
test['tidy_tweet'] = test_tokenized_tweet


In [47]:
#change to sentances again after stemming was complete
for i in range(len(train_tokenized_tweet)):
    train_tokenized_tweet[i] = ' '.join(train_tokenized_tweet[i])
    
train['tidy_tweet'] = train_tokenized_tweet

In [48]:
#importing word to vector model
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

In [49]:
train_tokenized_tweet = train['tidy_tweet'].apply(lambda x: x.split()) # tokenizing

model_w2v = Word2Vec(train_tokenized_tweet, size=200,  window=5)

model_w2v.train(train_tokenized_tweet, total_examples= len(train['tidy_tweet']), epochs=20)

(2224921, 3557100)

In [51]:
test_tokenized_tweet = test['tidy_tweet'].apply(lambda x: x.split()) # tokenizing

model_w2v = Word2Vec(test_tokenized_tweet, size=200,  window=5)

model_w2v.train(test_tokenized_tweet, total_examples= len(test['tidy_tweet']), epochs=20)

(1438614, 2368440)

In [52]:
def word_vector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += model_w2v[word].reshape((1, size))
            count += 1.
        except KeyError: # handling the case where the token is not in vocabulary
                         
            continue
    if count != 0:
        vec /= count
    return vec

In [54]:
#
wordvec_arrays = np.zeros((len(train_tokenized_tweet), 200))

for i in range(len(train_tokenized_tweet)):
    wordvec_arrays[i,:] = word_vector(train_tokenized_tweet[i], 200)
    
wordvec_df = pd.DataFrame(wordvec_arrays)
wordvec_df.shape

(15819, 200)

In [55]:
from sklearn.model_selection import train_test_split


train_w2v = wordvec_df.iloc[:31962,:]
test_w2v = wordvec_df.iloc[31962:,:]

_, _, ytrain, yvalid = train_test_split(train_w2v, train['sentiment'],  
                                                          random_state=42, 
                                                          test_size=0.3)



print(train_w2v.shape, train['sentiment'].shape)

xtrain_w2v = train_w2v.iloc[ytrain.index,:]
xvalid_w2v = train_w2v.iloc[yvalid.index,:]

(15819, 200) (15819,)


In [98]:
#Split the train data into train and validation 

from sklearn.model_selection import train_test_split

X = train["tidy_tweet"]
y = train['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [99]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(X_train_tfidf,y_train)

ValueError: Found input variables with inconsistent numbers of samples: [12655, 13446]

In [100]:
from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import LinearSVC

text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
])

# Feed the training data through the pipeline
text_clf.fit(X_train, y_train) 

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

In [101]:
# Form a prediction set
predictions = text_clf.predict(X_test)

In [102]:
# Report the confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test,predictions))

[[  82   28   88    8]
 [  21  126  145   21]
 [  22   79 1097  127]
 [   3   13   97  416]]


In [103]:
# Print a classification report
print(metrics.classification_report(y_test,predictions))

              precision    recall  f1-score   support

          -1       0.64      0.40      0.49       206
           0       0.51      0.40      0.45       313
           1       0.77      0.83      0.80      1325
           2       0.73      0.79      0.76       529

    accuracy                           0.73      2373
   macro avg       0.66      0.60      0.62      2373
weighted avg       0.71      0.73      0.72      2373



In [104]:
# Print the overall accuracy
print(metrics.accuracy_score(y_test,predictions))

0.7252423093131057
