# Advanced Regression Predict

## Introduction

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


# Accuracy Scores
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn import metrics

# For visualization
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as ms
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
#print(os.listdir("../input"))

# Data Cleaning
import emoji
from bs4 import BeautifulSoup
import re
import itertools
import string

# Any results you write to the current directory are saved as output.

Loading the data.

In [2]:
train = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [3]:
train_data = train.copy()

## Preprocessing

In [4]:
train_data.shape, test_data.shape

((15819, 3), (10546, 2))

In [5]:
train_data.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  15819 non-null  int64 
 1   message    15819 non-null  object
 2   tweetid    15819 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 370.9+ KB


In [7]:
train_data['message'].value_counts()


RT @StephenSchlegel: she's thinking about how she's going to die because your husband doesn't believe in climate change https://t.co/SjoFoNÃ¢â‚¬Â¦    307
RT @SenSanders: We have a president-elect who doesn't believe in climate change. Millions of people are going to have to say: Mr. TÃ¢â‚¬Â¦            130
RT @NatGeoChannel: Watch #BeforeTheFlood right here, as @LeoDiCaprio travels the world to tackle climate change https://t.co/LkDehj3tNn httÃ¢â‚¬Â¦     73
RT @BernieSanders: #ImVotingBecause the future of the planet is at stake. Hillary Clinton will combat climate change. Donald Trump thinks iÃ¢â‚¬Â¦     59
RT @SethMacFarlane: HRC proposes installing half a billion solar panels by the end of her first term. Trump thinks climate change is a hoaxÃ¢â‚¬Â¦     56
                                                                                                                                                     ... 
is this an article about global warming or a coupon for tide? either way the

In [8]:
test_data.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


### Text Cleaning

In [9]:
#cleaning the data

def drop_features(features,data):
    data.drop(features,inplace=True,axis=1)

In [10]:
import re
## example ## 
re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ","ouch...junior is angryð#got7 #junior #yugyo..., @user")

'ouch   junior is angry     got7  junior  yugyo      '

In [11]:
def load_dict_smileys():
    
    return {
        ":‑)":"smiley",
        ":-]":"smiley",
        ":-3":"smiley",
        ":->":"smiley",
        "8-)":"smiley",
        ":-}":"smiley",
        ":)":"smiley",
        ":]":"smiley",
        ":3":"smiley",
        ":>":"smiley",
        "8)":"smiley",
        ":}":"smiley",
        ":o)":"smiley",
        ":c)":"smiley",
        ":^)":"smiley",
        "=]":"smiley",
        "=)":"smiley",
        ":-))":"smiley",
        ":‑D":"smiley",
        "8‑D":"smiley",
        "x‑D":"smiley",
        "X‑D":"smiley",
        ":D":"smiley",
        "8D":"smiley",
        "xD":"smiley",
        "XD":"smiley",
        ":‑(":"sad",
        ":‑c":"sad",
        ":‑<":"sad",
        ":‑[":"sad",
        ":(":"sad",
        ":c":"sad",
        ":<":"sad",
        ":[":"sad",
        ":-||":"sad",
        ">:[":"sad",
        ":{":"sad",
        ":@":"sad",
        ">:(":"sad",
        ":'‑(":"sad",
        ":'(":"sad",
        ":‑P":"playful",
        "X‑P":"playful",
        "x‑p":"playful",
        ":‑p":"playful",
        ":‑Þ":"playful",
        ":‑þ":"playful",
        ":‑b":"playful",
        ":P":"playful",
        "XP":"playful",
        "xp":"playful",
        ":p":"playful",
        ":Þ":"playful",
        ":þ":"playful",
        ":b":"playful",
        "<3":"love"
        }


In [12]:
def load_dict_contractions():
    
    return {
        "ain't":"is not",
        "amn't":"am not",
        "aren't":"are not",
        "can't":"cannot",
        "'cause":"because",
        "couldn't":"could not",
        "couldn't've":"could not have",
        "could've":"could have",
        "daren't":"dare not",
        "daresn't":"dare not",
        "dasn't":"dare not",
        "didn't":"did not",
        "doesn't":"does not",
        "don't":"do not",
        "e'er":"ever",
        "em":"them",
        "everyone's":"everyone is",
        "finna":"fixing to",
        "gimme":"give me",
        "gonna":"going to",
        "gon't":"go not",
        "gotta":"got to",
        "hadn't":"had not",
        "hasn't":"has not",
        "haven't":"have not",
        "he'd":"he would",
        "he'll":"he will",
        "he's":"he is",
        "he've":"he have",
        "how'd":"how would",
        "how'll":"how will",
        "how're":"how are",
        "how's":"how is",
        "I'd":"I would",
        "I'll":"I will",
        "I'm":"I am",
        "I'm'a":"I am about to",
        "I'm'o":"I am going to",
        "isn't":"is not",
        "it'd":"it would",
        "it'll":"it will",
        "it's":"it is",
        "I've":"I have",
        "kinda":"kind of",
        "let's":"let us",
        "mayn't":"may not",
        "may've":"may have",
        "mightn't":"might not",
        "might've":"might have",
        "mustn't":"must not",
        "mustn't've":"must not have",
        "must've":"must have",
        "needn't":"need not",
        "ne'er":"never",
        "o'":"of",
        "o'er":"over",
        "ol'":"old",
        "oughtn't":"ought not",
        "shalln't":"shall not",
        "shan't":"shall not",
        "she'd":"she would",
        "she'll":"she will",
        "she's":"she is",
        "shouldn't":"should not",
        "shouldn't've":"should not have",
        "should've":"should have",
        "somebody's":"somebody is",
        "someone's":"someone is",
        "something's":"something is",
        "that'd":"that would",
        "that'll":"that will",
        "that're":"that are",
        "that's":"that is",
        "there'd":"there would",
        "there'll":"there will",
        "there're":"there are",
        "there's":"there is",
        "these're":"these are",
        "they'd":"they would",
        "they'll":"they will",
        "they're":"they are",
        "they've":"they have",
        "this's":"this is",
        "those're":"those are",
        "'tis":"it is",
        "'twas":"it was",
        "wanna":"want to",
        "wasn't":"was not",
        "we'd":"we would",
        "we'd've":"we would have",
        "we'll":"we will",
        "we're":"we are",
        "weren't":"were not",
        "we've":"we have",
        "what'd":"what did",
        "what'll":"what will",
        "what're":"what are",
        "what's":"what is",
        "what've":"what have",
        "when's":"when is",
        "where'd":"where did",
        "where're":"where are",
        "where's":"where is",
        "where've":"where have",
        "which's":"which is",
        "who'd":"who would",
        "who'd've":"who would have",
        "who'll":"who will",
        "who're":"who are",
        "who's":"who is",
        "who've":"who have",
        "why'd":"why did",
        "why're":"why are",
        "why's":"why is",
        "won't":"will not",
        "wouldn't":"would not",
        "would've":"would have",
        "y'all":"you all",
        "you'd":"you would",
        "you'll":"you will",
        "you're":"you are",
        "you've":"you have",
        "Whatcha":"What are you",
        "luv":"love",
        "sux":"sucks"
        }

In [13]:
import string
print ('Cleaning punctuation...')
def remove_punctuation_numbers(post):
    punc_numbers = string.punctuation + '0123456789'
    return ''.join([l for l in post if l not in punc_numbers])

Cleaning punctuation...


In [14]:
def strip_accents(text):
    if 'ø' in text or  'Ø' in text:
        #Do nothing when finding ø 
        return text   
    text = text.encode('ascii', 'ignore')
    text = text.decode("utf-8")
    return str(text)

def tweet_cleaning_for_sentiment_analysis(tweet):    
    
    #Lower case
    tweet = tweet.lower()
    #CONTRACTIONS source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
    #CONTRACTIONS = load_dict_contractions()
    #tweet = tweet.replace("’","'")
    #words = tweet.split()
    #reformed = [CONTRACTIONS[word] if word in CONTRACTIONS else word for word in words]
    #tweet = " ".join(reformed)
    #Escaping HTML characters
    tweet = BeautifulSoup(tweet).get_text()
    #Special case not handled previously.
    tweet = tweet.replace('\x92',"'")
    #Removal of hastags/tags
    tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ", tweet).split())
    #Removal of address
    tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " ", tweet).split())
    #Removal of Punctuation
    tweet = remove_punctuation_numbers(tweet)
    # Standardizing words
    tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))
    #Deal with smileys
    #source: https://en.wikipedia.org/wiki/List_of_emoticons
    #SMILEY = load_dict_smileys()  
    #words = tweet.split()
    #reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    #tweet = " ".join(reformed)
    #Deal with emojis
    #tweet = emoji.demojize(tweet)
    #Strip accents
    #tweet= strip_accents(tweet)
    #tweet = tweet.replace(":"," ")
    #tweet = ' '.join(tweet.split())
    return tweet

In [15]:
train_data['processed_tweets'] = train_data['message'].apply(tweet_cleaning_for_sentiment_analysis)

In [16]:
train_data.head(10)

Unnamed: 0,sentiment,message,tweetid,processed_tweets
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221,polyscimajor epa chief doesn t think carbon di...
1,1,It's not like we lack evidence of anthropogeni...,126103,it s not like we lack evidence of anthropogeni...
2,2,RT @RawStory: Researchers say we have three ye...,698562,rt researchers say we have three years to act ...
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736,todayinmaker wired was a pivotal year in the ...
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954,rt it s and a racist sexist climate change de...
5,1,Worth a read whether you do or don't believe i...,425577,worth a read whether you do or don t believe i...
6,1,RT @thenation: Mike Pence doesn’t believe in g...,294933,rt mike pence doesn t believe in global warmin...
7,1,RT @makeandmendlife: Six big things we can ALL...,992717,rt six big things we can all do today to fight...
8,1,@AceofSpadesHQ My 8yo nephew is inconsolable. ...,664510,my yo nephew is inconsolable he wants to die o...
9,1,RT @paigetweedy: no offense… but like… how do ...,260471,rt no offense but like how do you just not bel...


In [17]:
drop_features(['tweetid','message'],train_data)

In [18]:
train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sentiment         15819 non-null  int64 
 1   processed_tweets  15819 non-null  object
dtypes: int64(1), object(1)
memory usage: 247.3+ KB


**Drop News**

In [19]:
train_data = train_data[train_data.sentiment != 2]

In [20]:
sentiments_list = list(train_data.sentiment.unique())
print(sentiments_list)

[1, 0, -1]


### Resampling

In [21]:
from sklearn.utils import resample

In [22]:
class_size = int(len(train_data[train_data['sentiment']==1]))

In [23]:
# seperating the four classes
class_1 = train_data[train_data['sentiment']==-1]
class_2 = train_data[train_data['sentiment']==0]
class_3 = train_data[train_data['sentiment']==1]
#class_4 = train_data[train_data['sentiment']==2]

In [24]:
# upsampling classes 1, 2, and 4 & downsampling class 3
class_1_up = resample(class_1,replace=True,n_samples=class_size, random_state=27)
class_2_up = resample(class_2,replace=True,n_samples=class_size, random_state=27)
#class_4_up = resample(class_4,replace=True,n_samples=class_size, random_state=27)
class_3_down = resample(class_3,replace=False,n_samples=class_size, random_state=27)


In [25]:
# Creating a new DataFrame out of the balanced bata
resampled_df = pd.concat([class_1_up, class_2_up,class_3_down])

## Training

In [26]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_data["processed_tweets"], train_data["sentiment"], test_size = 0.2, random_state = 42)


In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
count_vect = CountVectorizer(stop_words='english')
transformer = TfidfTransformer(norm='l2',sublinear_tf=True)

In [28]:
x_train_counts = count_vect.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

In [29]:
print(x_train_counts.shape)
print(x_train_tfidf.shape)

(9743, 15968)
(9743, 15968)


In [30]:
x_train_counts

<9743x15968 sparse matrix of type '<class 'numpy.int64'>'
	with 104950 stored elements in Compressed Sparse Row format>

In [31]:
x_test_counts = count_vect.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)

In [32]:
print(x_test_counts.shape)
print(x_test_tfidf.shape)

(2436, 15968)
(2436, 15968)


In [33]:
names = ['Logistic Regression', 'Nearest Neighbors', 
         #'Linear SVM', 'RBF SVM',          
         'Decision Tree', 'Random Forest',  
         'AdaBoost']

In [34]:
classifiers = [
    LogisticRegression(), 
    KNeighborsClassifier(1),
    #SVC(kernel="linear", C=0.025),
    #SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=1, min_samples_leaf = 1),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),    
    AdaBoostClassifier()
]

In [35]:
results = []

models = {}
confusion = {}
class_report = {}


for name, clf in zip(names, classifiers):    
    print ('Fitting {:s} model...'.format(name))
    run_time = %timeit -q -o clf.fit(x_train_tfidf,y_train)
    
    print ('... predicting')
    y_pred = clf.predict(x_train_tfidf)   
    y_pred_test = clf.predict(x_test_tfidf)
    
    print ('... scoring')
    accuracy  = metrics.accuracy_score(y_train, y_pred)
    precision = metrics.precision_score(y_train, y_pred, average='weighted')
    recall    = metrics.recall_score(y_train, y_pred, average='weighted')
    
    f1        = metrics.f1_score(y_train, y_pred, average='weighted')    
    f1_test   = metrics.f1_score(y_test, y_pred_test, average='weighted')    
    
    # Save the results to dictionaries
    models[name] = clf    
    confusion[name] = metrics.confusion_matrix(y_train, y_pred)
    class_report[name] = metrics.classification_report(y_train, y_pred)
    
    results.append([name, accuracy, precision, recall, f1, f1_test, run_time.best])

    
results = pd.DataFrame(results, columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Train', 'F1 Test', 'Train Time'])
results.set_index('Classifier', inplace= True)

print ('... All done!')

Fitting Logistic Regression model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

... predicting
... scoring
Fitting Nearest Neighbors model...
... predicting
... scoring
Fitting Decision Tree model...
... predicting
... scoring


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting Random Forest model...
... predicting
... scoring
Fitting AdaBoost model...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


... predicting
... scoring
... All done!


In [39]:
results.sort_values('F1 Train', ascending=False)

Unnamed: 0_level_0,Accuracy,Precision,Recall,F1 Train,F1 Test,Train Time
Classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Nearest Neighbors,0.999589,0.999589,0.999589,0.999589,0.676166,0.000797
Logistic Regression,0.860105,0.873329,0.860105,0.845977,0.735169,0.798569
AdaBoost,0.745971,0.721786,0.745971,0.713891,0.705028,0.624045
Decision Tree,0.70235,0.493296,0.70235,0.579547,0.566721,0.010598
Random Forest,0.70235,0.493296,0.70235,0.579547,0.566721,0.019666


In [40]:
print(class_report['Nearest Neighbors'])

              precision    recall  f1-score   support

          -1       1.00      1.00      1.00      1040
           0       1.00      1.00      1.00      1860
           1       1.00      1.00      1.00      6843

    accuracy                           1.00      9743
   macro avg       1.00      1.00      1.00      9743
weighted avg       1.00      1.00      1.00      9743



In [41]:
print(class_report['AdaBoost'])

              precision    recall  f1-score   support

          -1       0.72      0.33      0.46      1040
           0       0.53      0.29      0.37      1860
           1       0.77      0.93      0.85      6843

    accuracy                           0.75      9743
   macro avg       0.68      0.52      0.56      9743
weighted avg       0.72      0.75      0.71      9743



In [42]:
#best model
model= KNeighborsClassifier(1)

### Hyperparameter Tuning

In [43]:
from sklearn.model_selection import GridSearchCV


In [44]:
param_grid = {'max_depth': [1,2,3,4,5,6,7,8,9], 'min_samples_leaf' : [1,2,3,4,5,6,7,8,9]}

In [45]:
grid_DT = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring='f1')

In [46]:
grid_DT.fit(x_train_tfidf, y_train)
grid_DT.best_params_

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

**preparing for test data**

In [None]:
test_data.info()

In [None]:
test_data['processed_tweet'] = test_data['message'].apply(tweet_cleaning_for_sentiment_analysis)

In [None]:
test_data.head()


In [47]:
drop_features(['message'],test_data)

In [40]:
train_counts = count_vect.fit_transform(train_data['processed_tweets'])
test_counts = count_vect.transform(test_data['processed_tweet'])

In [41]:
print(train_counts.shape)
print(test_counts.shape)

(12179, 18407)
(10546, 18407)


In [48]:
train_tfidf = transformer.fit_transform(train_counts)
test_tfidf = transformer.transform(test_counts)

NameError: name 'train_counts' is not defined

In [43]:

print(train_tfidf.shape)
print(test_tfidf.shape)


(12179, 18407)
(10546, 18407)


In [44]:
model.fit(train_tfidf,train_data['sentiment'])

KNeighborsClassifier(n_neighbors=1)

In [45]:
predictions = model.predict(test_tfidf)

In [46]:
final_result = pd.DataFrame({'tweetid':test_data['tweetid'],'sentiment':predictions})
final_result.to_csv('Output.csv',index=False)

In [47]:
final_result.head()

Unnamed: 0,tweetid,sentiment
0,169760,1
1,35326,1
2,224985,1
3,476263,0
4,872928,1
