## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
re.compile('<title>(.*)</title>')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import f1_score


## Import the dataset

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

# Cleaning the Data

In [4]:
#Function to clean out any noise from the data
def standardize_text(df, text_field):
    
    df[text_field] = df[text_field].str.replace(r"http\S+", "")
    df[text_field] = df[text_field].str.replace(r"http", "")
    df[text_field] = df[text_field].str.replace(r'&amp;?',r'and')
    df[text_field] = df[text_field].str.replace(r'&lt;',r'<')
    df[text_field] = df[text_field].str.replace(r'&gt;',r'>')
    df[text_field] = df[text_field].str.replace(r"@\S+", "")
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df[text_field] = df[text_field].str.replace(r"@", "")
    df[text_field] = df[text_field].str.lower()
    return df

# Cleaning noise from train set
train = standardize_text(train, 'message')

# Cleaning noise from test set
test = standardize_text(test, 'message')

# Replace apostrophe/short words in python

# Removing Punctuation

In [5]:
import string

# Function to remove all punctuation (,?!.)
def remove_punctuation(post):
    return ''.join([l for l in post if l not in string.punctuation])

# Cleaning punctuation on train set
train['new message'] = train['message'].apply(remove_punctuation)

# Cleaning punctuation on test set
test['new message'] = test['message'].apply(remove_punctuation)

# Spliting joint words with word ninja

In [6]:
import wordninja

# Function to split join word in train set
sentences = []
def attached_words(df, message):
    for word in message:
        word = wordninja.split(word)
        sentences.append(" ".join(word))
    df['new message'] = sentences
    return df

# Spliting words on the train set
train = attached_words(train,train['new message'])


In [7]:
# Function to split join word in test set

sentences1 = []
def attached_words(df1, message1):
    for word1 in message1:
        word1 = wordninja.split(word1)
        sentences1.append(" ".join(word1))
    df1['new message'] = sentences1
    return df1

# Spliting words on the test set
test = attached_words(test,test['new message'])

In [8]:
# Function to replace RT with Retweet
def retweets_text(df, text_field):
    df[text_field] = df[text_field].str.replace(r"rt ","retweet ")
    return df

# Replace RT with Retweet on the train set
train = retweets_text(train, 'new message')

# Replace RT with Retweet on the test set
test = retweets_text(test, 'new message')

# Removing Stopwords

In [9]:

custom_stopwords = ['to', 'through', 'if', 'yours', 'such', 'your', 'i', 'because', 'at', 'now', 'her', 'it', 
                    'weren', 'don', 'were', 'all', 'above', 'once', 'any', 'as', "you'd", 'but', 'did', 
                    'the', "it's", 'them', "should've", 'down', 'they', 'in', 'below', 'she', 'up', 
                    'my', 'doing', 'themselves', 'which', 'you', 'into', 'each', 'very', 
                    'ourselves', 'yourselves', 'then', 'ain', 'should', 'most', "you're", 
                    'this', 'there', 'yourself', 'where', 'with', 'about', 
                    'from', 'so', 'do', "that'll", 'same', 'some', 'what', 'too', 'further', 
                    'before', 'having', 'their', 'out', 'those', 'of', 'few', 'than', 
                    'our', 'is', 'against', 'off', 'by', 'can', 'are', 'will', 'and', 
                    'when', 'hers', "you've", 'being', 'for', 'just', 'between', 'here', 'have', 
                    'myself', 'been', 'am', 'own', 'no', 'needn', 'd', 'whom', 'during', 
                    'only', 'or', 'after', 'until', 'be', 'has', 'on', 'other', "you'll", 'we', 'more', 
                    'a', "she's", 'again', 've', 'he', 'himself', 'was', 'over', 'me', 'its', 
                    'itself', 'theirs', 'these', 'had', 'who', 'while', 'that', 'how', 'does', 'ours', 'y', 
                    're', 'herself', 'both', 'an', 'his', 'him', 'under', 'why']



# words to exclude from the original stopwords set


# calc to excluding the above words from original list
#new_stopwords = stopwords - exclude_words

# creating a list from train messages
messages_train = train['new message'].to_numpy()
# creating a list from test messages
messages_test = test['new message'].to_numpy()

# function to remove stopwords from messages
def remove_stopwords(messages):
    output_array=[]
    for sentence in messages:
        temp_list=[]
        for word in sentence.split():
            if word.lower() not in custom_stopwords:
                temp_list.append(word)
        output_array.append(' '.join(temp_list))
    return output_array

In [10]:
# applying stopword funtion to train set
train['new message'] = remove_stopwords(messages_train)
# applying stopword funtion to test set
test['new message'] = remove_stopwords(messages_test)

In [11]:
train.head()

Unnamed: 0,sentiment,message,tweetid,new message
0,1,polyscimajor epa chief doesn't think carbon di...,625221,poly sci major epa chief doesnt think carbon d...
1,1,it's not like we lack evidence of anthropogeni...,126103,not like lack evidence anthropogenic global wa...
2,2,rt researchers say we have three years to act...,698562,retweet researchers say three years act climat...
3,1,todayinmaker wired 2016 was a pivotal year...,573736,today maker wired 2016 pivotal year war climat...
4,1,"rt it's 2016, and a racist, sexist, climate c...",466954,retweet 2016 racist sexist climate change deny...


In [12]:
test.head()

Unnamed: 0,message,tweetid,new message
0,europe will now be looking to china to make su...,169760,europe looking china make sure not alone fight...
1,combine this with the polling of staffers re c...,35326,combine polling staffers climate change womens...
2,"the scary, unimpeachable evidence that climate...",224985,scary unimpeachable evidence climate change al...
3,\nputin got to you too jill ! \ntrump does...,476263,put got jill trump doesnt believe climate chan...
4,rt 'female orgasms cause global warming!'\n s...,872928,retweet female orgasms cause global warming sa...


# Most Frequent Words

In [13]:
most_freq = pd.Series(' '.join(train['new message']).split()).value_counts()[:10]
most_freq

climate    13467
change     12945
retweet     9715
global      3884
warming     3598
s           2866
trump       2167
believe     1161
not         1101
t            894
dtype: int64

# Less Frequent Words

In [14]:
less_freq = pd.Series(' '.join(train['new message']).split()).value_counts()[-10:]
less_freq

cem             1
hawked          1
malawi          1
ava             1
expects         1
downpours       1
experimented    1
deng            1
unholy          1
ezra            1
dtype: int64

## Tokens 

In [15]:
from nltk.tokenize import TreebankWordTokenizer, word_tokenize
tokeniser = TreebankWordTokenizer()

#tokenizing the train set
train['tokens'] = train['new message'].apply(tokeniser.tokenize)
#tokenizing the test set
test['tokens'] = test['new message'].apply(tokeniser.tokenize)

## Stemming

In [16]:
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer
snowball_stemmer = SnowballStemmer('english')
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()

# function for stemming
def dataset_stem(words, stemmer):
    return [snowball_stemmer.stem(word) for word in words]

# applying stemming to the train set
train['snowball_stemmer'] = train['tokens'].apply(dataset_stem, args=(snowball_stemmer,))
train['porter_stemmer'] = train['tokens'].apply(dataset_stem, args=(porter_stemmer,))
train['lancaster_stemmer'] = train['tokens'].apply(dataset_stem, args=(lancaster_stemmer,))

# applying stemming to the test set
test['snowball_stemmer'] = test['tokens'].apply(dataset_stem, args=(snowball_stemmer,))
test['porter_stemmer'] = test['tokens'].apply(dataset_stem, args=(porter_stemmer,))
test['lancaster_stemmer'] = test['tokens'].apply(dataset_stem, args=(lancaster_stemmer,))

# Lemmatization

In [17]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Function for Lemmatizer
def dataset_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]   

#Applying lemmatizer of train set
train['lemma'] = train['tokens'].apply(dataset_lemma, args=(lemmatizer, ))

#Applying lemmatizer of train set
test['lemma'] = test['tokens'].apply(dataset_lemma, args=(lemmatizer, ))

# Joining words in stemming and Lemma

In [18]:
# joining words for the snowball stemming on the train set
snowball_stem_sentence_train = []
for words in train['snowball_stemmer']:
    snowball_stem_sentence_train.append(' '.join(words))

# joining words for the porter stemming on the train set
porter_stem_sentence_train = []
for words in train['porter_stemmer']:
    porter_stem_sentence_train.append(' '.join(words))

# joining words for the lancaster stemming on the train set
lancaster_stem_sentence_train = []
for words in train['lancaster_stemmer']:
    lancaster_stem_sentence_train.append(' '.join(words))

# joining words for the lemma on the train set
lemma_sentence_train = []
for words in train['lemma']:
    lemma_sentence_train.append(' '.join(words))

# setting lists to the dataframe column
train['snowball_stemmer'] = snowball_stem_sentence_train
train['porter_stemmer'] = porter_stem_sentence_train
train['lancaster_stemmer'] = lancaster_stem_sentence_train
train['lemma'] = lemma_sentence_train
    

# joining words for the snowball stemming on the test set
snowball_stem_sentence_test = []
for words in test['snowball_stemmer']:
    snowball_stem_sentence_test.append(' '.join(words))

# joining words for the porter stemming on the test set
porter_stem_sentence_test = []
for words in test['porter_stemmer']:
    porter_stem_sentence_test.append(' '.join(words))

# joining words for the lancaster stemming on the test set
lancaster_stem_sentence_test = []
for words in test['lancaster_stemmer']:
    lancaster_stem_sentence_test.append(' '.join(words))

# joining words for the lemma on the test set
lemma_sentence_test = []
for words in test['lemma']:
    lemma_sentence_test.append(' '.join(words))

# setting lists to the dataframe column
test['snowball_stemmer'] = snowball_stem_sentence_test
test['porter_stemmer'] = porter_stem_sentence_test
test['lancaster_stemmer'] = lancaster_stem_sentence_test
test['lemma'] = lemma_sentence_test


In [19]:
train.head()

Unnamed: 0,sentiment,message,tweetid,new message,tokens,snowball_stemmer,porter_stemmer,lancaster_stemmer,lemma
0,1,polyscimajor epa chief doesn't think carbon di...,625221,poly sci major epa chief doesnt think carbon d...,"[poly, sci, major, epa, chief, doesnt, think, ...",poli sci major epa chief doesnt think carbon d...,poli sci major epa chief doesnt think carbon d...,poli sci major epa chief doesnt think carbon d...,poly sci major epa chief doesnt think carbon d...
1,1,it's not like we lack evidence of anthropogeni...,126103,not like lack evidence anthropogenic global wa...,"[not, like, lack, evidence, anthropogenic, glo...",not like lack evid anthropogen global warm,not like lack evid anthropogen global warm,not like lack evid anthropogen global warm,not like lack evidence anthropogenic global wa...
2,2,rt researchers say we have three years to act...,698562,retweet researchers say three years act climat...,"[retweet, researchers, say, three, years, act,...",retweet research say three year act climat cha...,retweet research say three year act climat cha...,retweet research say three year act climat cha...,retweet researcher say three year act climate ...
3,1,todayinmaker wired 2016 was a pivotal year...,573736,today maker wired 2016 pivotal year war climat...,"[today, maker, wired, 2016, pivotal, year, war...",today maker wire 2016 pivot year war climat chang,today maker wire 2016 pivot year war climat chang,today maker wire 2016 pivot year war climat chang,today maker wired 2016 pivotal year war climat...
4,1,"rt it's 2016, and a racist, sexist, climate c...",466954,retweet 2016 racist sexist climate change deny...,"[retweet, 2016, racist, sexist, climate, chang...",retweet 2016 racist sexist climat chang deni b...,retweet 2016 racist sexist climat chang deni b...,retweet 2016 racist sexist climat chang deni b...,retweet 2016 racist sexist climate change deny...


In [48]:
test.head()

Unnamed: 0,message,tweetid,new message,tokens,snowball_stemmer,porter_stemmer,lancaster_stemmer,lemma
0,europe will now be looking to china to make su...,169760,europe looking china make sure not alone fight...,"[europe, looking, china, make, sure, not, alon...",europ look china make sure not alon fight clim...,europ look china make sure not alon fight clim...,europ look china make sure not alon fight clim...,europe looking china make sure not alone fight...
1,combine this with the polling of staffers re c...,35326,combine polling staffers climate change womens...,"[combine, polling, staffers, climate, change, ...",combin poll staffer climat chang women right f...,combin poll staffer climat chang women right f...,combin poll staffer climat chang women right f...,combine polling staffer climate change woman r...
2,"the scary, unimpeachable evidence that climate...",224985,scary unimpeachable evidence climate change al...,"[scary, unimpeachable, evidence, climate, chan...",scari unimpeach evid climat chang alreadi time...,scari unimpeach evid climat chang alreadi time...,scari unimpeach evid climat chang alreadi time...,scary unimpeachable evidence climate change al...
3,\nputin got to you too jill ! \ntrump does...,476263,put got jill trump doesnt believe climate chan...,"[put, got, jill, trump, doesnt, believe, clima...",put got jill trump doesnt believ climat chang ...,put got jill trump doesnt believ climat chang ...,put got jill trump doesnt believ climat chang ...,put got jill trump doesnt believe climate chan...
4,rt 'female orgasms cause global warming!'\n s...,872928,retweet female orgasms cause global warming sa...,"[retweet, female, orgasms, cause, global, warm...",retweet femal orgasm caus global warm sarcast ...,retweet femal orgasm caus global warm sarcast ...,retweet femal orgasm caus global warm sarcast ...,retweet female orgasm cause global warming sar...


In [22]:
from textblob import TextBlob

polarity = []
subjectivity = []
for sentence in train['lemma']:
    polarity.append((TextBlob(sentence).sentiment.polarity))
    subjectivity.append((TextBlob(sentence).sentiment.subjectivity))
    
train['polarity'] = polarity
train['subjectivity'] = subjectivity

In [23]:
train.head()

Unnamed: 0,sentiment,message,tweetid,new message,tokens,snowball_stemmer,porter_stemmer,lancaster_stemmer,lemma,polarity,subjectivity
0,1,polyscimajor epa chief doesn't think carbon di...,625221,poly sci major epa chief doesnt think carbon d...,"[poly, sci, major, epa, chief, doesnt, think, ...",poli sci major epa chief doesnt think carbon d...,poli sci major epa chief doesnt think carbon d...,poli sci major epa chief doesnt think carbon d...,poly sci major epa chief doesnt think carbon d...,0.076389,0.277778
1,1,it's not like we lack evidence of anthropogeni...,126103,not like lack evidence anthropogenic global wa...,"[not, like, lack, evidence, anthropogenic, glo...",not like lack evid anthropogen global warm,not like lack evid anthropogen global warm,not like lack evid anthropogen global warm,not like lack evidence anthropogenic global wa...,0.0,0.0
2,2,rt researchers say we have three years to act...,698562,retweet researchers say three years act climat...,"[retweet, researchers, say, three, years, act,...",retweet research say three year act climat cha...,retweet research say three year act climat cha...,retweet research say three year act climat cha...,retweet researcher say three year act climate ...,-0.3,0.6
3,1,todayinmaker wired 2016 was a pivotal year...,573736,today maker wired 2016 pivotal year war climat...,"[today, maker, wired, 2016, pivotal, year, war...",today maker wire 2016 pivot year war climat chang,today maker wire 2016 pivot year war climat chang,today maker wire 2016 pivot year war climat chang,today maker wired 2016 pivotal year war climat...,0.5,0.8
4,1,"rt it's 2016, and a racist, sexist, climate c...",466954,retweet 2016 racist sexist climate change deny...,"[retweet, 2016, racist, sexist, climate, chang...",retweet 2016 racist sexist climat chang deni b...,retweet 2016 racist sexist climat chang deni b...,retweet 2016 racist sexist climat chang deni b...,retweet 2016 racist sexist climate change deny...,0.0,0.0


## Splitting out the X variable from the target

In [27]:
y = train['sentiment']
X = train['lemma']

## Turning text into something your model can read

In [28]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2)
X_vectorized = vectorizer.fit_transform(X)

## Splitting the training data into a training and validation set

In [41]:
X_train,X_val,y_train,y_val = train_test_split(X_vectorized,y,test_size=0.1, random_state=18)

## Creating a function to measure best model

In [43]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

def model_selection(model, X_train, X_val, y_train, y_val):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    return print(f1_score(y_val, y_pred, average='macro'))

# Linear SVC

In [44]:
rfc = LinearSVC(dual= True,fit_intercept= False,loss= 'hinge',multi_class= 'ovr',penalty= 'l2')
model_selection(rfc, X_train, X_val, y_train, y_val)

#0.6627376887718992

0.6869158097766954


# Logistic Regression

In [45]:
model_selection(LogisticRegression(), X_train, X_val, y_train, y_val)

#0.619999147722196

0.6268069636564937


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# XGBoost

In [46]:
xgb = XGBClassifier()
model_selection(XGBClassifier(), X_train, X_val, y_train, y_val)

0.6028702705628369


# Using the best model

In [82]:
from sklearn.model_selection import cross_val_score
rfc = LinearSVC(dual= True,fit_intercept= False,loss= 'hinge',multi_class= 'ovr',penalty= 'l2')
rfc.fit(X_vectorized, y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=False,
          intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
          penalty='l2', random_state=None, tol=0.0001, verbose=0)

In [83]:
test.head()

Unnamed: 0,message,tweetid,tokens,snowball_stemmer,porter_stemmer,lancaster_stemmer,lemma
0,europe looking china make sure fighting climat...,169760,"[europe, looking, china, make, sure, fighting,...",europ look china make sure fight climat chang,europ look china make sure fight climat chang,europ look china make sure fight climat chang,europe looking china make sure fighting climat...
1,combine polling staffers climate change womens...,35326,"[combine, polling, staffers, climate, change, ...",combin poll staffer climat chang women right f...,combin poll staffer climat chang women right f...,combin poll staffer climat chang women right f...,combine polling staffer climate change woman r...
2,scary unimpeachable evidence climate change ti...,224985,"[scary, unimpeachable, evidence, climate, chan...",scari unimpeach evid climat chang time chang c...,scari unimpeach evid climat chang time chang c...,scari unimpeach evid climat chang time chang c...,scary unimpeachable evidence climate change ti...
3,got jill trump doesnt believe climate change t...,476263,"[got, jill, trump, doesnt, believe, climate, c...",got jill trump doesnt believ climat chang thin...,got jill trump doesnt believ climat chang thin...,got jill trump doesnt believ climat chang thin...,got jill trump doesnt believe climate change t...
4,retweet female orgasms cause global warming sa...,872928,"[retweet, female, orgasms, cause, global, warm...",retweet femal orgasm caus global warm sarcast ...,retweet femal orgasm caus global warm sarcast ...,retweet femal orgasm caus global warm sarcast ...,retweet female orgasm cause global warming sar...


## Getting our test set ready 

In [84]:
testx = test['lemma']
test_vect = vectorizer.transform(testx)

## Making predictions on the test set and adding a sentiment column to our original test df

In [85]:
y_pred = rfc.predict(test_vect)

In [86]:
test['sentiment'] = y_pred

## Creating an output csv for submission

In [87]:
test[['tweetid','sentiment']].to_csv('testsubmission.csv', index=False)