## Import libraries

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
re.compile('<title>(.*)</title>')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score


## Import the dataset

In [58]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [59]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [60]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


# Cleaning the Data

In [61]:
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "")
    df[text_field] = df[text_field].str.replace(r"http", "")
    df[text_field] = df[text_field].str.replace(r"@\S+", "")
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df[text_field] = df[text_field].str.replace(r"@", "")
    df[text_field] = df[text_field].str.replace(r"RT ","Retweet ")
    df[text_field] = df[text_field].str.lower()
    return df

train = standardize_text(train, 'message')

# Removing Punctuation

In [62]:
import string

def remove_punctuation(post):
    return ''.join([l for l in post if l not in string.punctuation])

train['message'] = train['message'].apply(remove_punctuation)
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesnt think carbon dio...,625221
1,1,Its not like we lack evidence of anthropogenic...,126103
2,2,Retweet Researchers say we have three years t...,698562
3,1,TodayinMaker WIRED 2016 was a pivotal year...,573736
4,1,Retweet Its 2016 and a racist sexist climate ...,466954


# Removing Stopwords

In [63]:
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))
exclude_words = set(("couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 
                     'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 
                     'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan',
                     "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 
                     'won', "won't", 'wouldn', "wouldn't", 'not', "aren't", "don't"))

new_stopwords = stopwords - exclude_words

messages = train['message'].to_numpy()

def remove_stopwords(messages):
    output_array=[]
    for sentence in messages:
        temp_list=[]
        for word in sentence.split():
            if word.lower() not in new_stopwords:
                temp_list.append(word)
        output_array.append(' '.join(temp_list))
    return output_array

In [64]:
train['message'] = remove_stopwords(messages)
train[:10]

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesnt think carbon dio...,625221
1,1,not like lack evidence anthropogenic global wa...,126103
2,2,Retweet Researchers say three years act climat...,698562
3,1,TodayinMaker WIRED 2016 pivotal year war clima...,573736
4,1,Retweet 2016 racist sexist climate change deny...,466954
5,1,Worth read whether dont believe climate change,425577
6,1,Retweet Mike Pence doesn believe global warmin...,294933
7,1,Retweet Six big things today fight climate cha...,992717
8,1,8yo nephew inconsolable wants die old age like...,664510
9,1,Retweet offense like not believe global warming,260471


# Most Frequent Words

In [65]:
most_freq = pd.Series(' '.join(train['message']).split()).value_counts()[:10]
most_freq

climate    12838
change     12660
Retweet     9724
global      3707
warming     3516
Trump       1985
believe     1158
not          977
amp          940
doesnt       802
dtype: int64

# Less Frequent Words

In [66]:
less_freq = pd.Series(' '.join(train['message']).split()).value_counts()[-10:]
less_freq

Hank                 1
Dies                 1
treading             1
Room                 1
kiss                 1
Fishel               1
science2016electi    1
inve                 1
pts                  1
sinc                 1
dtype: int64

## Tokens 

In [67]:
from nltk.tokenize import TreebankWordTokenizer, word_tokenize

tokeniser = TreebankWordTokenizer()
train['tokens'] = train['message'].apply(tokeniser.tokenize)
train.head()

Unnamed: 0,sentiment,message,tweetid,tokens
0,1,PolySciMajor EPA chief doesnt think carbon dio...,625221,"[PolySciMajor, EPA, chief, doesnt, think, carb..."
1,1,not like lack evidence anthropogenic global wa...,126103,"[not, like, lack, evidence, anthropogenic, glo..."
2,2,Retweet Researchers say three years act climat...,698562,"[Retweet, Researchers, say, three, years, act,..."
3,1,TodayinMaker WIRED 2016 pivotal year war clima...,573736,"[TodayinMaker, WIRED, 2016, pivotal, year, war..."
4,1,Retweet 2016 racist sexist climate change deny...,466954,"[Retweet, 2016, racist, sexist, climate, chang..."


## Stemming

In [72]:
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer
stemmer = SnowballStemmer('english')

def dataset_stem(words, stemmer):
    return [stemmer.stem(word) for word in words]

train['stem'] = train['tokens'].apply(dataset_stem, args=(stemmer,))
train.head()

Unnamed: 0,sentiment,message,tweetid,tokens,stem
0,1,polyscimajor epa chief doesnt think carbon dio...,625221,"[polyscimajor, epa, chief, doesnt, think, carb...","[polyscimajor, epa, chief, doesnt, think, carb..."
1,1,not like lack evidence anthropogenic global wa...,126103,"[not, like, lack, evidence, anthropogenic, glo...","[not, like, lack, evid, anthropogen, global, w..."
2,2,retweet researchers say three years act climat...,698562,"[retweet, researchers, say, three, years, act,...","[retweet, research, say, three, year, act, cli..."
3,1,todayinmaker wired 2016 pivotal year war clima...,573736,"[todayinmaker, wired, 2016, pivotal, year, war...","[todayinmak, wire, 2016, pivot, year, war, cli..."
4,1,retweet 2016 racist sexist climate change deny...,466954,"[retweet, 2016, racist, sexist, climate, chang...","[retweet, 2016, racist, sexist, climat, chang,..."


# Lemmatization

In [73]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def dataset_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]   

train['lemma'] = train['tokens'].apply(dataset_lemma, args=(lemmatizer, ))
train.head()

Unnamed: 0,sentiment,message,tweetid,tokens,stem,lemma
0,1,polyscimajor epa chief doesnt think carbon dio...,625221,"[polyscimajor, epa, chief, doesnt, think, carb...","[polyscimajor, epa, chief, doesnt, think, carb...","[polyscimajor, epa, chief, doesnt, think, carb..."
1,1,not like lack evidence anthropogenic global wa...,126103,"[not, like, lack, evidence, anthropogenic, glo...","[not, like, lack, evid, anthropogen, global, w...","[not, like, lack, evidence, anthropogenic, glo..."
2,2,retweet researchers say three years act climat...,698562,"[retweet, researchers, say, three, years, act,...","[retweet, research, say, three, year, act, cli...","[retweet, researcher, say, three, year, act, c..."
3,1,todayinmaker wired 2016 pivotal year war clima...,573736,"[todayinmaker, wired, 2016, pivotal, year, war...","[todayinmak, wire, 2016, pivot, year, war, cli...","[todayinmaker, wired, 2016, pivotal, year, war..."
4,1,retweet 2016 racist sexist climate change deny...,466954,"[retweet, 2016, racist, sexist, climate, chang...","[retweet, 2016, racist, sexist, climat, chang,...","[retweet, 2016, racist, sexist, climate, chang..."


# Joining words in stemming and Lemma

In [74]:
lemma_sentence = []
for words in train['lemma']:
    lemma_sentence.append(' '.join(words))

stem_sentence = []
for words in train['stem']:
    stem_sentence.append(' '.join(words))
    
train['lemma'] = lemma_sentence
train['stem'] = stem_sentence

train.head()

Unnamed: 0,sentiment,message,tweetid,tokens,stem,lemma
0,1,polyscimajor epa chief doesnt think carbon dio...,625221,"[polyscimajor, epa, chief, doesnt, think, carb...",polyscimajor epa chief doesnt think carbon dio...,polyscimajor epa chief doesnt think carbon dio...
1,1,not like lack evidence anthropogenic global wa...,126103,"[not, like, lack, evidence, anthropogenic, glo...",not like lack evid anthropogen global warm,not like lack evidence anthropogenic global wa...
2,2,retweet researchers say three years act climat...,698562,"[retweet, researchers, say, three, years, act,...",retweet research say three year act climat cha...,retweet researcher say three year act climate ...
3,1,todayinmaker wired 2016 pivotal year war clima...,573736,"[todayinmaker, wired, 2016, pivotal, year, war...",todayinmak wire 2016 pivot year war climat chang,todayinmaker wired 2016 pivotal year war clima...
4,1,retweet 2016 racist sexist climate change deny...,466954,"[retweet, 2016, racist, sexist, climate, chang...",retweet 2016 racist sexist climat chang deni b...,retweet 2016 racist sexist climate change deny...


## Splitting out the X variable from the target

In [75]:
y = train['sentiment']
X = train['stem']

## Turning text into something your model can read

In [76]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2)
X_vectorized = vectorizer.fit_transform(X)

## Splitting the training data into a training and validation set

In [77]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=0.2,shuffle=True, stratify=y, random_state=11)

## Creating a function to measure best model

In [78]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

def model_selection(model, X_train, X_val, y_train, y_val):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    return print(f1_score(y_val, y_pred, average='macro'))

# Logistic Regression

In [246]:
model_selection(LogisticRegression(), X_train, X_val, y_train, y_val)

0.5876026123084404


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Naiive Bayes

In [247]:
model_selection(MultinomialNB(), X_train, X_val, y_train, y_val)

0.45924109081425507


# Random Forest

In [248]:
model_selection(RandomForestClassifier(), X_train, X_val, y_train, y_val)

0.5576799864496926


# Adaboost Classifier

In [249]:
model_selection(AdaBoostClassifier(), X_train, X_val, y_train, y_val)

0.47913283189835626


# Linear SVC

In [85]:
rfc = LinearSVC(C=0.3, fit_intercept=False, random_state=1)
model_selection(rfc, X_train, X_val, y_train, y_val)

{'C': 0.3, 'fit_intercept': False, 'random_state': 1}

0.6285113650693761


{'C': 0.3, 'fit_intercept': False, 'random_state': 1}

# Linear SVC with Grid Search

In [81]:
from sklearn.model_selection import GridSearchCV
rfc = LinearSVC()
parameters=[{'C':[0.1,0.2,0.3],'fit_intercept':[True, False],'random_state':[1,2,3,4,5,6]}]

gscv = GridSearchCV(rfc,parameters,scoring='f1_macro',n_jobs=-1,cv=5)
grid_search=gscv.fit(X_vectorized,y)

#penalty='l2',
#loss='squared_hinge',
#dual=True,
#tol=0.0001,
#C=1.0,
#multi_class='ovr',
#fit_intercept=True,
#intercept_scaling=1,
#class_weight=None,
#verbose=0,
#random_state=None,
#max_iter=1000,

In [82]:
grid_search.best_params_

{'C': 0.3, 'fit_intercept': False, 'random_state': 1}

# XGBoost

In [251]:
model_selection(XGBClassifier(), X_train, X_val, y_train, y_val)

0.5641932390457639


# KNeighbours

In [252]:
model_selection(KNeighborsClassifier(), X_train, X_val, y_train, y_val)

0.5291337551408124


# Support Vector

In [253]:
model_selection(SVC(), X_train, X_val, y_train, y_val)

0.5754640429191721


# Decision Tree

In [254]:
model_selection(DecisionTreeClassifier(), X_train, X_val, y_train, y_val)

0.514376125007167


In [86]:
from sklearn.model_selection import cross_val_score
rfc = LinearSVC(C=0.3, fit_intercept=False, random_state=1)
rfc.fit(X_vectorized, y)

LinearSVC(C=0.3, class_weight=None, dual=True, fit_intercept=False,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=1, tol=0.0001,
          verbose=0)

In [94]:
test = standardize_text(test, 'message')

In [95]:
test['message'] = test['message'].apply(remove_punctuation)

In [96]:
test['tokens'] = test['message'].apply(tokeniser.tokenize)

In [97]:
test['stem'] = test['tokens'].apply(dataset_stem, args=(stemmer,))

In [98]:
test['lemma'] = test['tokens'].apply(dataset_lemma, args=(lemmatizer, ))

In [99]:
lemma_sentence_1 = []
for words in test['lemma']:
    lemma_sentence_1.append(' '.join(words))

stem_sentence_1 = []
for words in test['stem']:
    stem_sentence_1.append(' '.join(words))
    
test['lemma'] = lemma_sentence_1
test['stem'] = stem_sentence_1

test.head()

Unnamed: 0,message,tweetid,tokens,stem,lemma
0,europe will now be looking to china to make su...,169760,"[europe, will, now, be, looking, to, china, to...",europ will now be look to china to make sure t...,europe will now be looking to china to make su...
1,combine this with the polling of staffers re c...,35326,"[combine, this, with, the, polling, of, staffe...",combin this with the poll of staffer re climat...,combine this with the polling of staffer re cl...
2,the scary unimpeachable evidence that climate ...,224985,"[the, scary, unimpeachable, evidence, that, cl...",the scari unimpeach evid that climat chang is ...,the scary unimpeachable evidence that climate ...
3,\nputin got to you too jill \ntrump doesn...,476263,"[putin, got, to, you, too, jill, trump, doesnt...",putin got to you too jill trump doesnt believ ...,putin got to you too jill trump doesnt believe...
4,retweet female orgasms cause global warming\n...,872928,"[retweet, female, orgasms, cause, global, warm...",retweet femal orgasm caus global warm sarcast ...,retweet female orgasm cause global warming sar...


## Getting our test set ready 

In [100]:
testx = test['stem']
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [101]:
y_pred = rfc.predict(test_vect)

In [102]:
test['sentiment'] = y_pred

## Creating an output csv for submission

In [103]:
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)