# Importing the Libaries 

In [1]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob

# Importing the training dataset

In [2]:
train_df = pd.read_csv('Train.csv', usecols=['message', 'sentiment'])
train_df.head()

Unnamed: 0,sentiment,message
0,1,PolySciMajor EPA chief doesn't think carbon di...
1,1,It's not like we lack evidence of anthropogeni...
2,2,RT @RawStory: Researchers say we have three ye...
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ..."


# Counting number of messages in each sentiment

In [3]:
train_df.sentiment.value_counts()

 1    8530
 2    3640
 0    2353
-1    1296
Name: sentiment, dtype: int64

# Cleaning the Data, removing links and @Mentions and Special Characters

In [4]:
def standardize_text(df, text_field):
    df[text_field] = df[text_field].str.replace(r"http\S+", "")
    df[text_field] = df[text_field].str.replace(r"http", "")
    df[text_field] = df[text_field].str.replace(r"@\S+", "")
    df[text_field] = df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    df[text_field] = df[text_field].str.replace(r"@", "")
    df[text_field] = df[text_field].str.replace(r"RT "," ")
    df[text_field] = df[text_field].str.lower()
    return df

In [5]:
clean_train_df = standardize_text(train_df, 'message')
clean_train_df.head()

Unnamed: 0,sentiment,message
0,1,polyscimajor epa chief doesn't think carbon di...
1,1,it's not like we lack evidence of anthropogeni...
2,2,researchers say we have three years to act o...
3,1,todayinmaker wired 2016 was a pivotal year...
4,1,"it's 2016, and a racist, sexist, climate cha..."


# Removing Punctuation

In [6]:
import string

def remove_punctuation(post):
    return ''.join([l for l in post if l not in string.punctuation])

In [7]:
clean_train_df['message'] = clean_train_df['message'].apply(remove_punctuation)
clean_train_df.head()

Unnamed: 0,sentiment,message
0,1,polyscimajor epa chief doesnt think carbon dio...
1,1,its not like we lack evidence of anthropogenic...
2,2,researchers say we have three years to act o...
3,1,todayinmaker wired 2016 was a pivotal year...
4,1,its 2016 and a racist sexist climate change ...


# Removing stopwords

In [8]:
from nltk.corpus import stopwords
stopwords=set(stopwords.words('english'))
exclude_words = set(("couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 
                     'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 
                     'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan',
                     "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 
                     'won', "won't", 'wouldn', "wouldn't", 'not', "aren't", "don't"))

new_stopwords = stopwords - exclude_words

messages = clean_train_df['message'].to_numpy()

def remove_stopwords(messages):
    output_array=[]
    for sentence in messages:
        temp_list=[]
        for word in sentence.split():
            if word.lower() not in new_stopwords:
                temp_list.append(word)
        output_array.append(' '.join(temp_list))
    return output_array

In [9]:
clean_train_df['message'] = remove_stopwords(messages)
clean_train_df[:10]

Unnamed: 0,sentiment,message
0,1,polyscimajor epa chief doesnt think carbon dio...
1,1,not like lack evidence anthropogenic global wa...
2,2,researchers say three years act climate change...
3,1,todayinmaker wired 2016 pivotal year war clima...
4,1,2016 racist sexist climate change denying bigo...
5,1,worth read whether dont believe climate change
6,1,mike pence doesn believe global warming smokin...
7,1,six big things today fight climate change clim...
8,1,8yo nephew inconsolable wants die old age like...
9,1,offense like not believe global warming


# Tokenisation

In [10]:
from nltk.tokenize import TreebankWordTokenizer, word_tokenize

tokeniser = TreebankWordTokenizer()
clean_train_df['tokens'] = clean_train_df['message'].apply(tokeniser.tokenize)

In [11]:
clean_train_df.head()

Unnamed: 0,sentiment,message,tokens
0,1,polyscimajor epa chief doesnt think carbon dio...,"[polyscimajor, epa, chief, doesnt, think, carb..."
1,1,not like lack evidence anthropogenic global wa...,"[not, like, lack, evidence, anthropogenic, glo..."
2,2,researchers say three years act climate change...,"[researchers, say, three, years, act, climate,..."
3,1,todayinmaker wired 2016 pivotal year war clima...,"[todayinmaker, wired, 2016, pivotal, year, war..."
4,1,2016 racist sexist climate change denying bigo...,"[2016, racist, sexist, climate, change, denyin..."


# Stemming

In [26]:
from nltk import SnowballStemmer, PorterStemmer, LancasterStemmer
stemmer = SnowballStemmer('english')

def dataset_stem(words, stemmer):
    return [stemmer.stem(word) for word in words]

clean_train_df['stem'] = clean_train_df['tokens'].apply(dataset_stem, args=(stemmer,))
clean_train_df.head()

Unnamed: 0,sentiment,message,tokens,stem,lemma
0,1,polyscimajor epa chief doesnt think carbon dio...,"[polyscimajor, epa, chief, doesnt, think, carb...","[polyscimajor, epa, chief, doesnt, think, carb...",polyscimajor epa chief doesnt think carbon dio...
1,1,not like lack evidence anthropogenic global wa...,"[not, like, lack, evidence, anthropogenic, glo...","[not, like, lack, evid, anthropogen, global, w...",not like lack evidence anthropogenic global wa...
2,2,researchers say three years act climate change...,"[researchers, say, three, years, act, climate,...","[research, say, three, year, act, climat, chan...",researcher say three year act climate change late
3,1,todayinmaker wired 2016 pivotal year war clima...,"[todayinmaker, wired, 2016, pivotal, year, war...","[todayinmak, wire, 2016, pivot, year, war, cli...",todayinmaker wired 2016 pivotal year war clima...
4,1,2016 racist sexist climate change denying bigo...,"[2016, racist, sexist, climate, change, denyin...","[2016, racist, sexist, climat, chang, deni, bi...",2016 racist sexist climate change denying bigo...


Unnamed: 0,sentiment,message,tokens,stem
0,1,polyscimajor epa chief doesnt think carbon dio...,"[polyscimajor, epa, chief, doesnt, think, carb...","[polyscimajor, epa, chief, doesnt, think, carb..."
1,1,not like lack evidence anthropogenic global wa...,"[not, like, lack, evidence, anthropogenic, glo...","[not, like, lack, evid, anthropogen, global, w..."
2,2,researchers say three years act climate change...,"[researchers, say, three, years, act, climate,...","[research, say, three, year, act, climat, chan..."
3,1,todayinmaker wired 2016 pivotal year war clima...,"[todayinmaker, wired, 2016, pivotal, year, war...","[todayinmak, wire, 2016, pivot, year, war, cli..."
4,1,2016 racist sexist climate change denying bigo...,"[2016, racist, sexist, climate, change, denyin...","[2016, racist, sexist, climat, chang, deni, bi..."


# Lemmatization

In [27]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def dataset_lemma(words, lemmatizer):
    return [lemmatizer.lemmatize(word) for word in words]   

clean_train_df['lemma'] = clean_train_df['tokens'].apply(dataset_lemma, args=(lemmatizer, ))
clean_train_df.head()

Unnamed: 0,sentiment,message,tokens,stem,lemma
0,1,polyscimajor epa chief doesnt think carbon dio...,"[polyscimajor, epa, chief, doesnt, think, carb...","[polyscimajor, epa, chief, doesnt, think, carb...","[polyscimajor, epa, chief, doesnt, think, carb..."
1,1,not like lack evidence anthropogenic global wa...,"[not, like, lack, evidence, anthropogenic, glo...","[not, like, lack, evid, anthropogen, global, w...","[not, like, lack, evidence, anthropogenic, glo..."
2,2,researchers say three years act climate change...,"[researchers, say, three, years, act, climate,...","[research, say, three, year, act, climat, chan...","[researcher, say, three, year, act, climate, c..."
3,1,todayinmaker wired 2016 pivotal year war clima...,"[todayinmaker, wired, 2016, pivotal, year, war...","[todayinmak, wire, 2016, pivot, year, war, cli...","[todayinmaker, wired, 2016, pivotal, year, war..."
4,1,2016 racist sexist climate change denying bigo...,"[2016, racist, sexist, climate, change, denyin...","[2016, racist, sexist, climat, chang, deni, bi...","[2016, racist, sexist, climate, change, denyin..."


In [28]:
lemma_sentence = []
for words in clean_train_df['lemma']:
    lemma_sentence.append(' '.join(words))

stem_sentence = []
for words in clean_train_df['stem']:
    stem_sentence.append(' '.join(words))
    
clean_train_df['lemma'] = lemma_sentence
clean_train_df['stem'] = stem_sentence

clean_train_df.head()

Unnamed: 0,sentiment,message,tokens,stem,lemma
0,1,polyscimajor epa chief doesnt think carbon dio...,"[polyscimajor, epa, chief, doesnt, think, carb...",polyscimajor epa chief doesnt think carbon dio...,polyscimajor epa chief doesnt think carbon dio...
1,1,not like lack evidence anthropogenic global wa...,"[not, like, lack, evidence, anthropogenic, glo...",not like lack evid anthropogen global warm,not like lack evidence anthropogenic global wa...
2,2,researchers say three years act climate change...,"[researchers, say, three, years, act, climate,...",research say three year act climat chang late,researcher say three year act climate change late
3,1,todayinmaker wired 2016 pivotal year war clima...,"[todayinmaker, wired, 2016, pivotal, year, war...",todayinmak wire 2016 pivot year war climat chang,todayinmaker wired 2016 pivotal year war clima...
4,1,2016 racist sexist climate change denying bigo...,"[2016, racist, sexist, climate, change, denyin...",2016 racist sexist climat chang deni bigot lea...,2016 racist sexist climate change denying bigo...


# Bag of words

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2)
    
X = vectorizer.fit_transform(clean_train_df['lemma']).toarray()
y = clean_train_df['sentiment']

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 6)

# Choosing a model

In [23]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

def model_selection(model, X_train, X_val, y_train, y_val):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_val)
    return print(f1_score(y_val, y_pred, average='macro'))

# Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

model_selection(LogisticRegression(),X_train, X_val, y_train, y_val)

0.5814861303004495


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# KNeighbors

In [25]:
from sklearn.neighbors import KNeighborsClassifier

model_selection(KNeighborsClassifier(),X_train, X_val, y_train, y_val)

0.5417303670231479


# Support Vector Classifer

In [None]:
from sklearn.svm import SVC

model_selection(SVC(kernel='linear'),X_train, X_val, y_train, y_val)

# Decision Tree

In [45]:
from sklearn.tree import DecisionTreeClassifier

model_selection(DecisionTreeClassifier(),X_train, X_val, y_train, y_val)

0.54643951931924


# Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier 

model_selection(RandomForestClassifier(n_estimators=60),X_train, X_val, y_train, y_val)

0.5816196761349723


# Naive Bayes

In [47]:
from sklearn.naive_bayes import MultinomialNB 

model_selection(MultinomialNB(),X_train, X_val, y_train, y_val)

0.6087586339601438


# Applying best model

In [48]:
sc = SVC(kernel='linear')
sc.fit(X, y)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# Importing the test dataset

In [49]:
test_df = pd.read_csv('test.csv')
tweet_ids = test_df['tweetid']
test_df.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


# Cleaning the test dataset

In [50]:
clean_test_df = standardize_text(test_df, 'message')
clean_test_df.head()

Unnamed: 0,message,tweetid
0,europe will now be looking to china to make su...,169760
1,combine this with the polling of staffers re c...,35326
2,"the scary, unimpeachable evidence that climate...",224985
3,\nputin got to you too jill ! \ntrump does...,476263
4,'female orgasms cause global warming!'\n sar...,872928


# Removing Punc

In [51]:
clean_test_df['message'] = clean_test_df['message'].apply(remove_punctuation)
clean_test_df.head()

Unnamed: 0,message,tweetid
0,europe will now be looking to china to make su...,169760
1,combine this with the polling of staffers re c...,35326
2,the scary unimpeachable evidence that climate ...,224985
3,\nputin got to you too jill \ntrump doesn...,476263
4,female orgasms cause global warming\n sarcas...,872928


# Removing stopwords

In [52]:
clean_test_df['message'] = remove_stopwords(clean_test_df['message'])
clean_test_df.head(10)

Unnamed: 0,message,tweetid
0,europe looking china make sure not alone fight...,169760
1,combine polling staffers climate change womens...,35326
2,scary unimpeachable evidence climate change al...,224985
3,putin got jill trump doesnt believe climate ch...,476263
4,female orgasms cause global warming sarcastic ...,872928
5,trump muzzles employees several gov agencies e...,75639
6,yes wrote 3rd yr comp sci ethics part told cli...,211536
7,indonesian farmers weather climate change w co...,569434
8,british scientists face huge hit us cuts clima...,315368
9,aid agriculture sustainable agriculture climat...,591733


# Tokenization

In [53]:
clean_test_df['tokens'] = clean_test_df['message'].apply(tokeniser.tokenize)
clean_test_df

Unnamed: 0,message,tweetid,tokens
0,europe looking china make sure not alone fight...,169760,"[europe, looking, china, make, sure, not, alon..."
1,combine polling staffers climate change womens...,35326,"[combine, polling, staffers, climate, change, ..."
2,scary unimpeachable evidence climate change al...,224985,"[scary, unimpeachable, evidence, climate, chan..."
3,putin got jill trump doesnt believe climate ch...,476263,"[putin, got, jill, trump, doesnt, believe, cli..."
4,female orgasms cause global warming sarcastic ...,872928,"[female, orgasms, cause, global, warming, sarc..."
...,...,...,...
10541,brb writing poem climate change climatechange ...,895714,"[brb, writing, poem, climate, change, climatec..."
10542,2016 year climate change came home hottest yea...,875167,"[2016, year, climate, change, came, home, hott..."
10543,pacific countries positive fiji leading global...,78329,"[pacific, countries, positive, fiji, leading, ..."
10544,hot must cause global warming aldublaboroflove,867455,"[hot, must, cause, global, warming, aldublabor..."


# Stemming

In [54]:
clean_test_df['stem'] = clean_test_df['tokens'].apply(dataset_stem, args=(stemmer,))
clean_test_df.head()

Unnamed: 0,message,tweetid,tokens,stem
0,europe looking china make sure not alone fight...,169760,"[europe, looking, china, make, sure, not, alon...","[europ, look, china, make, sure, not, alon, fi..."
1,combine polling staffers climate change womens...,35326,"[combine, polling, staffers, climate, change, ...","[combin, poll, staffer, climat, chang, women, ..."
2,scary unimpeachable evidence climate change al...,224985,"[scary, unimpeachable, evidence, climate, chan...","[scari, unimpeach, evid, climat, chang, alread..."
3,putin got jill trump doesnt believe climate ch...,476263,"[putin, got, jill, trump, doesnt, believe, cli...","[putin, got, jill, trump, doesnt, believ, clim..."
4,female orgasms cause global warming sarcastic ...,872928,"[female, orgasms, cause, global, warming, sarc...","[femal, orgasm, caus, global, warm, sarcast, r..."


# Lemma

In [55]:
clean_test_df['lemma'] = clean_test_df['tokens'].apply(dataset_lemma, args=(lemmatizer, ))
clean_test_df.head()

Unnamed: 0,message,tweetid,tokens,stem,lemma
0,europe looking china make sure not alone fight...,169760,"[europe, looking, china, make, sure, not, alon...","[europ, look, china, make, sure, not, alon, fi...","[europe, looking, china, make, sure, not, alon..."
1,combine polling staffers climate change womens...,35326,"[combine, polling, staffers, climate, change, ...","[combin, poll, staffer, climat, chang, women, ...","[combine, polling, staffer, climate, change, w..."
2,scary unimpeachable evidence climate change al...,224985,"[scary, unimpeachable, evidence, climate, chan...","[scari, unimpeach, evid, climat, chang, alread...","[scary, unimpeachable, evidence, climate, chan..."
3,putin got jill trump doesnt believe climate ch...,476263,"[putin, got, jill, trump, doesnt, believe, cli...","[putin, got, jill, trump, doesnt, believ, clim...","[putin, got, jill, trump, doesnt, believe, cli..."
4,female orgasms cause global warming sarcastic ...,872928,"[female, orgasms, cause, global, warming, sarc...","[femal, orgasm, caus, global, warm, sarcast, r...","[female, orgasm, cause, global, warming, sarca..."


# Bag of words

In [66]:
test_messages = []
cv = CountVectorizer(max_features=5000)
for words in clean_test_df['lemma']:
    test_messages.append(' '.join(words))

In [69]:
X = cv.fit_transform(test_messages).toarray()

In [68]:
y_pred = sc.predict(X)

In [58]:
final = pd.DataFrame({'tweetid': tweet_ids, 'sentiment': y_pred})

In [59]:
final.to_csv('final_prediction.csv', index=False)

In [60]:
final.sentiment.value_counts()

 0    4982
 1    2727
 2    1515
-1    1322
Name: sentiment, dtype: int64