## Import libraries

In [438]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
re.compile('<title>(.*)</title>')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score


## Import the dataset

In [439]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [440]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

In [441]:
train.tail()

Unnamed: 0,sentiment,message,tweetid
15814,1,RT @ezlusztig: They took down the material on ...,22001
15815,2,RT @washingtonpost: How climate change could b...,17856
15816,0,notiven: RT: nytimesworld :What does Trump act...,384248
15817,-1,RT @sara8smiles: Hey liberals the climate chan...,819732
15818,0,RT @Chet_Cannon: .@kurteichenwald's 'climate c...,806319


# Cleaning the Data

In [442]:
#Function to clean out any noise from the data
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "")
    df[text_field] = df[text_field].str.replace(r"http", "")
    df[text_field] = df[text_field].str.replace(r"@\S+", "")
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df[text_field] = df[text_field].str.replace(r"@", "")
    df[text_field] = df[text_field].str.lower()
    return df

# Cleaning noise from train set
train = standardize_text(train, 'message')

# Cleaning noise from test set
test = standardize_text(test, 'message')

# Removing Punctuation

In [443]:
import string

# Function to remove all punctuation (,?!.)
def remove_punctuation(post):
    return ''.join([l for l in post if l not in string.punctuation])

# Cleaning punctuation on train set
train['message'] = train['message'].apply(remove_punctuation)

# Cleaning punctuation on test set
test['message'] = test['message'].apply(remove_punctuation)

# Spliting joint words with word ninja

In [444]:
import wordninja

# Function to split join word in train set
sentences = []
def attached_words(df, message):
    for word in message:
        word = wordninja.split(word)
        sentences.append(" ".join(word))
    df['message'] = sentences
    return df

# Spliting words on the train set
train = attached_words(train,train['message'])


In [445]:
# Function to split join word in test set

sentences1 = []
def attached_words(df1, message1):
    for word1 in message1:
        word1 = wordninja.split(word1)
        sentences1.append(" ".join(word1))
    df1['message'] = sentences1
    return df1

# Spliting words on the test set
test = attached_words(test,test['message'])

In [446]:
# Function to replace RT with Retweet
def retweets_text(df, text_field):
    df[text_field] = df[text_field].str.replace(r"rt ","retweet ")
    return df

# Replace RT with Retweet on the train set
train = retweets_text(train, 'message')

# Replace RT with Retweet on the test set
test = retweets_text(test, 'message')

# Removing Stopwords

In [447]:
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))

# words to exclude from the original stopwords set
exclude_words = set(("couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 
                     'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 
                     'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan',
                     "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 
                     'won', "won't", 'wouldn', "wouldn't", 'not', "aren't", "don't"))

# calc to excluding the above words from original list
new_stopwords = stopwords - exclude_words

# creating a list from train messages
messages_train = train['message'].to_numpy()
# creating a list from test messages
messages_test = test['message'].to_numpy()

# function to remove stopwords from messages
def remove_stopwords(messages):
    output_array=[]
    for sentence in messages:
        temp_list=[]
        for word in sentence.split():
            if word.lower() not in new_stopwords:
                temp_list.append(word)
        output_array.append(' '.join(temp_list))
    return output_array

In [448]:
# applying stopword funtion to train set
train['message'] = remove_stopwords(messages_train)
# applying stopword funtion to test set
test['message'] = remove_stopwords(messages_test)

In [449]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,poly sci major epa chief doesnt think carbon d...,625221
1,1,not like lack evidence anthropogenic global wa...,126103
2,2,retweet researchers say three years act climat...,698562
3,1,today maker wired 2016 pivotal year war climat...,573736
4,1,retweet 2016 racist sexist climate change deny...,466954


In [450]:
test.head()

Unnamed: 0,message,tweetid
0,europe looking china make sure not alone fight...,169760
1,combine polling staffers climate change womens...,35326
2,scary unimpeachable evidence climate change al...,224985
3,put got jill trump doesnt believe climate chan...,476263
4,retweet female orgasms cause global warming sa...,872928


# Most Frequent Words

In [451]:
most_freq = pd.Series(' '.join(train['message']).split()).value_counts()[:10]
most_freq

climate    13467
change     12945
retweet     9715
global      3884
warming     3598
trump       2167
believe     1161
not         1101
amp          940
doesnt       813
dtype: int64

# Less Frequent Words

In [452]:
less_freq = pd.Series(' '.join(train['message']).split()).value_counts()[-10:]
less_freq

symmetry      1
nigh          1
accomplice    1
salman        1
boring        1
espanol       1
wes           1
feb           1
ramsey        1
blades        1
dtype: int64

## Tokens 

In [453]:
from nltk.tokenize import TreebankWordTokenizer, word_tokenize
tokeniser = TreebankWordTokenizer()

#tokenizing the train set
train['tokens'] = train['message'].apply(tokeniser.tokenize)
#tokenizing the test set
test['tokens'] = test['message'].apply(tokeniser.tokenize)

## Stemming

In [454]:
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer
snowball_stemmer = SnowballStemmer('english')
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

# function for stemming
def dataset_stem(words, stemmer):
    return [snowball_stemmer.stem(word) for word in words]

# applying stemming to the train set
train['snowball_stemmer'] = train['tokens'].apply(dataset_stem, args=(snowball_stemmer,))
train['porter_stemmer'] = train['tokens'].apply(dataset_stem, args=(porter_stemmer,))
train['lancaster_stemmer'] = train['tokens'].apply(dataset_stem, args=(lancaster_stemmer,))

# applying stemming to the test set
test['snowball_stemmer'] = test['tokens'].apply(dataset_stem, args=(snowball_stemmer,))
test['porter_stemmer'] = test['tokens'].apply(dataset_stem, args=(porter_stemmer,))
test['lancaster_stemmer'] = test['tokens'].apply(dataset_stem, args=(lancaster_stemmer,))

# Lemmatization

In [455]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Function for Lemmatizer
def dataset_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]   

#Applying lemmatizer of train set
train['lemma'] = train['tokens'].apply(dataset_lemma, args=(lemmatizer, ))

#Applying lemmatizer of train set
test['lemma'] = test['tokens'].apply(dataset_lemma, args=(lemmatizer, ))

# Joining words in stemming and Lemma

In [456]:
# joining words for the snowball stemming on the train set
snowball_stem_sentence_train = []
for words in train['snowball_stemmer']:
    snowball_stem_sentence_train.append(' '.join(words))

# joining words for the porter stemming on the train set
porter_stem_sentence_train = []
for words in train['porter_stemmer']:
    porter_stem_sentence_train.append(' '.join(words))

# joining words for the lancaster stemming on the train set
lancaster_stem_sentence_train = []
for words in train['lancaster_stemmer']:
    lancaster_stem_sentence_train.append(' '.join(words))

# joining words for the lemma on the train set
lemma_sentence_train = []
for words in train['lemma']:
    lemma_sentence_train.append(' '.join(words))

# setting lists to the dataframe column
train['snowball_stemmer'] = snowball_stem_sentence_train
train['porter_stemmer'] = porter_stem_sentence_train
train['lancaster_stemmer'] = lancaster_stem_sentence_train
train['lemma'] = lemma_sentence_train
    

# joining words for the snowball stemming on the test set
snowball_stem_sentence_test = []
for words in test['snowball_stemmer']:
    snowball_stem_sentence_test.append(' '.join(words))

# joining words for the porter stemming on the test set
porter_stem_sentence_test = []
for words in test['porter_stemmer']:
    porter_stem_sentence_test.append(' '.join(words))

# joining words for the lancaster stemming on the test set
lancaster_stem_sentence_test = []
for words in test['lancaster_stemmer']:
    lancaster_stem_sentence_test.append(' '.join(words))

# joining words for the lemma on the test set
lemma_sentence_test = []
for words in test['lemma']:
    lemma_sentence_test.append(' '.join(words))

# setting lists to the dataframe column
test['snowball_stemmer'] = snowball_stem_sentence_test
test['porter_stemmer'] = porter_stem_sentence_test
test['lancaster_stemmer'] = lancaster_stem_sentence_test
test['lemma'] = lemma_sentence_test


In [457]:
train.head()

Unnamed: 0,sentiment,message,tweetid,tokens,snowball_stemmer,porter_stemmer,lancaster_stemmer,lemma
0,1,poly sci major epa chief doesnt think carbon d...,625221,"[poly, sci, major, epa, chief, doesnt, think, ...",poli sci major epa chief doesnt think carbon d...,poli sci major epa chief doesnt think carbon d...,poli sci major epa chief doesnt think carbon d...,poly sci major epa chief doesnt think carbon d...
1,1,not like lack evidence anthropogenic global wa...,126103,"[not, like, lack, evidence, anthropogenic, glo...",not like lack evid anthropogen global warm,not like lack evid anthropogen global warm,not like lack evid anthropogen global warm,not like lack evidence anthropogenic global wa...
2,2,retweet researchers say three years act climat...,698562,"[retweet, researchers, say, three, years, act,...",retweet research say three year act climat cha...,retweet research say three year act climat cha...,retweet research say three year act climat cha...,retweet researcher say three year act climate ...
3,1,today maker wired 2016 pivotal year war climat...,573736,"[today, maker, wired, 2016, pivotal, year, war...",today maker wire 2016 pivot year war climat chang,today maker wire 2016 pivot year war climat chang,today maker wire 2016 pivot year war climat chang,today maker wired 2016 pivotal year war climat...
4,1,retweet 2016 racist sexist climate change deny...,466954,"[retweet, 2016, racist, sexist, climate, chang...",retweet 2016 racist sexist climat chang deni b...,retweet 2016 racist sexist climat chang deni b...,retweet 2016 racist sexist climat chang deni b...,retweet 2016 racist sexist climate change deny...


In [458]:
test.head()

Unnamed: 0,message,tweetid,tokens,snowball_stemmer,porter_stemmer,lancaster_stemmer,lemma
0,europe looking china make sure not alone fight...,169760,"[europe, looking, china, make, sure, not, alon...",europ look china make sure not alon fight clim...,europ look china make sure not alon fight clim...,europ look china make sure not alon fight clim...,europe looking china make sure not alone fight...
1,combine polling staffers climate change womens...,35326,"[combine, polling, staffers, climate, change, ...",combin poll staffer climat chang women right f...,combin poll staffer climat chang women right f...,combin poll staffer climat chang women right f...,combine polling staffer climate change woman r...
2,scary unimpeachable evidence climate change al...,224985,"[scary, unimpeachable, evidence, climate, chan...",scari unimpeach evid climat chang alreadi time...,scari unimpeach evid climat chang alreadi time...,scari unimpeach evid climat chang alreadi time...,scary unimpeachable evidence climate change al...
3,put got jill trump doesnt believe climate chan...,476263,"[put, got, jill, trump, doesnt, believe, clima...",put got jill trump doesnt believ climat chang ...,put got jill trump doesnt believ climat chang ...,put got jill trump doesnt believ climat chang ...,put got jill trump doesnt believe climate chan...
4,retweet female orgasms cause global warming sa...,872928,"[retweet, female, orgasms, cause, global, warm...",retweet femal orgasm caus global warm sarcast ...,retweet femal orgasm caus global warm sarcast ...,retweet femal orgasm caus global warm sarcast ...,retweet female orgasm cause global warming sar...


## Splitting out the X variable from the target

In [459]:
y = train['sentiment']
X = train['lemma']

## Turning text into something your model can read

In [509]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2)
X_vectorized = vectorizer.fit_transform(X)

## Splitting the training data into a training and validation set

In [513]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=0.2,shuffle=True, stratify=y, random_state=32)

## Creating a function to measure best model

In [514]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

def model_selection(model, X_train, X_val, y_train, y_val):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    return print(f1_score(y_val, y_pred, average='macro'))

# Logistic Regression

In [463]:
model_selection(LogisticRegression(), X_train, X_val, y_train, y_val)

0.5629160864227646


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Naiive Bayes

In [464]:
model_selection(MultinomialNB(), X_train, X_val, y_train, y_val)

0.4474891743978757


# Random Forest

In [465]:
model_selection(RandomForestClassifier(), X_train, X_val, y_train, y_val)

0.5312875587452006


# Adaboost Classifier

In [466]:
model_selection(AdaBoostClassifier(), X_train, X_val, y_train, y_val)

0.4751660956260282


# Linear SVC

In [518]:
rfc = LinearSVC(dual= True,fit_intercept= False,loss= 'hinge',multi_class= 'ovr',penalty= 'l2')
model_selection(rfc, X_train, X_val, y_train, y_val)

#0.6635216619974396

0.6635216619974396


In [502]:
from sklearn.model_selection import GridSearchCV
parameters=[{'loss':['hinge'], 'fit_intercept':[False], 'class_weight':['balanced'], 'random_state': [1]}]

gscv=GridSearchCV(rfc,parameters,scoring='f1_macro',n_jobs=-1,cv=5)
grid_search=gscv.fit(X_vectorized,y)

In [503]:
grid_search.best_params_

{'class_weight': 'balanced',
 'fit_intercept': False,
 'loss': 'hinge',
 'random_state': 1}

In [504]:
grid_search.best_score_

#0.6471141084993387

0.6471141084993387

# XGBoost

In [468]:
model_selection(XGBClassifier(), X_train, X_val, y_train, y_val)

0.5606369527290738


# KNeighbours

In [469]:
model_selection(KNeighborsClassifier(), X_train, X_val, y_train, y_val)

0.5507078442077513


# Support Vector

In [470]:
model_selection(SVC(kernel = 'linear'), X_train, X_val, y_train, y_val)

0.6175817855364216


# Decision Tree

In [471]:
model_selection(DecisionTreeClassifier(), X_train, X_val, y_train, y_val)

0.5086789983725637


In [519]:
from sklearn.model_selection import cross_val_score
rfc = LinearSVC(dual= True,fit_intercept= False,loss= 'hinge',multi_class= 'ovr',penalty= 'l2')
rfc.fit(X_vectorized, y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=False,
          intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
          penalty='l2', random_state=None, tol=0.0001, verbose=0)

In [520]:
test.head()

Unnamed: 0,message,tweetid,tokens,snowball_stemmer,porter_stemmer,lancaster_stemmer,lemma,sentiment
0,europe looking china make sure not alone fight...,169760,"[europe, looking, china, make, sure, not, alon...",europ look china make sure not alon fight clim...,europ look china make sure not alon fight clim...,europ look china make sure not alon fight clim...,europe looking china make sure not alone fight...,1
1,combine polling staffers climate change womens...,35326,"[combine, polling, staffers, climate, change, ...",combin poll staffer climat chang women right f...,combin poll staffer climat chang women right f...,combin poll staffer climat chang women right f...,combine polling staffer climate change woman r...,1
2,scary unimpeachable evidence climate change al...,224985,"[scary, unimpeachable, evidence, climate, chan...",scari unimpeach evid climat chang alreadi time...,scari unimpeach evid climat chang alreadi time...,scari unimpeach evid climat chang alreadi time...,scary unimpeachable evidence climate change al...,1
3,put got jill trump doesnt believe climate chan...,476263,"[put, got, jill, trump, doesnt, believe, clima...",put got jill trump doesnt believ climat chang ...,put got jill trump doesnt believ climat chang ...,put got jill trump doesnt believ climat chang ...,put got jill trump doesnt believe climate chan...,1
4,retweet female orgasms cause global warming sa...,872928,"[retweet, female, orgasms, cause, global, warm...",retweet femal orgasm caus global warm sarcast ...,retweet femal orgasm caus global warm sarcast ...,retweet femal orgasm caus global warm sarcast ...,retweet female orgasm cause global warming sar...,0


## Getting our test set ready 

In [533]:
testx = test['lemma']
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [534]:
y_pred = rfc.predict(test_vect)

In [535]:
test['sentiment'] = y_pred

## Creating an output csv for submission

In [536]:
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)