In [303]:
from IPython.display import Image
from IPython.display import IFrame
from IPython import display
import pandas as pd
import numpy as np

import nltk
import sklearn
import imblearn
import csv
%matplotlib inline

from nltk import TreebankWordTokenizer, SnowballStemmer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import  word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from imblearn.metrics import classification_report_imbalanced

from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

import string
import urllib
import math
import re

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Precious\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Precious\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [304]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test_with_no_labels.csv')

In [305]:
train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [306]:
test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [307]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www.\.\S+') 
    return url_pattern.sub(r'',text) 

In [308]:
tweet = train['message'][2]
tweet

'RT @RawStory: Researchers say we have three years to act on climate change before it’s too late https://t.co/WdT0KdUr2f https://t.co/Z0ANPT…'

In [309]:
remove_urls(tweet)

'RT @RawStory: Researchers say we have three years to act on climate change before it’s too late  '

In [310]:
def to_lower(text):
    
    return text.lower()

In [311]:
to_lower(tweet)

'rt @rawstory: researchers say we have three years to act on climate change before it’s too late https://t.co/wdt0kdur2f https://t.co/z0anpt…'

In [312]:
stemmer = SnowballStemmer('english')


def get_roots(text):
    " ".join([stemmer.stem(w) for w in str(text).split()])
    return" ".join([stemmer.stem(w) for w in str(text).split()])

In [313]:
get_roots(tweet)

'rt @rawstory: research say we have three year to act on climat chang befor it too late https://t.co/wdt0kdur2f https://t.co/z0anpt…'

In [314]:
tokenizer = TreebankWordTokenizer()

In [315]:
tokenizer.tokenize(tweet)

['RT',
 '@',
 'RawStory',
 ':',
 'Researchers',
 'say',
 'we',
 'have',
 'three',
 'years',
 'to',
 'act',
 'on',
 'climate',
 'change',
 'before',
 'it’s',
 'too',
 'late',
 'https',
 ':',
 '//t.co/WdT0KdUr2f',
 'https',
 ':',
 '//t.co/Z0ANPT…']

In [316]:
tweet1 = remove_urls(tweet)
tweet1

'RT @RawStory: Researchers say we have three years to act on climate change before it’s too late  '

In [317]:
tweet2 = to_lower(tweet)
tweet2

'rt @rawstory: researchers say we have three years to act on climate change before it’s too late https://t.co/wdt0kdur2f https://t.co/z0anpt…'

In [318]:
tweet3 = get_roots(tweet)
tweet3

'rt @rawstory: research say we have three year to act on climat chang befor it too late https://t.co/wdt0kdur2f https://t.co/z0anpt…'

In [319]:
tweet4 = tokenizer.tokenize(tweet)
tweet4

['RT',
 '@',
 'RawStory',
 ':',
 'Researchers',
 'say',
 'we',
 'have',
 'three',
 'years',
 'to',
 'act',
 'on',
 'climate',
 'change',
 'before',
 'it’s',
 'too',
 'late',
 'https',
 ':',
 '//t.co/WdT0KdUr2f',
 'https',
 ':',
 '//t.co/Z0ANPT…']

In [320]:
train['message'].head()

0    PolySciMajor EPA chief doesn't think carbon di...
1    It's not like we lack evidence of anthropogeni...
2    RT @RawStory: Researchers say we have three ye...
3    #TodayinMaker# WIRED : 2016 was a pivotal year...
4    RT @SoyNovioDeTodas: It's 2016, and a racist, ...
Name: message, dtype: object

In [321]:
train['message_clean'] = train['message'].apply(remove_urls).apply(to_lower).apply(get_roots).apply(tokenizer.tokenize)

In [322]:
train['message_clean'].head()

0    [polyscimajor, epa, chief, does, n't, think, c...
1    [it, not, like, we, lack, evid, of, anthropoge...
2    [rt, @, rawstory, :, research, say, we, have, ...
3    [#, todayinmaker, #, wire, :, 2016, was, a, pi...
4    [rt, @, soynoviodetodas, :, it, 2016, ,, and, ...
Name: message_clean, dtype: object

In [323]:
tt = TfidfVectorizer(preprocessor = list,tokenizer =list,ngram_range=(1,2),min_df = 2,strip_accents = 'ascii',smooth_idf = False)

tt.fit(train['message_clean'])
vect = tt.transform(train['message_clean'])



In [324]:
train_vec =tt.transform(train['message_clean'])
train_vec.shape

(15819, 35761)

In [325]:
train_vec.toarray()

MemoryError: Unable to allocate 4.21 GiB for an array with shape (15819, 35761) and data type float64

In [326]:
y = train['sentiment'].apply(str)
y.head()

0    1
1    1
2    2
3    1
4    1
Name: sentiment, dtype: object

In [327]:
x = train_vec.toarray()
x.shape

MemoryError: Unable to allocate 4.21 GiB for an array with shape (15819, 35761) and data type float64

In [328]:
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [329]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,stratify = y,random_state= 42)
y_train

ValueError: Found input variables with inconsistent numbers of samples: [10546, 15819]

In [330]:
lr = LogisticRegression(solver= "liblinear",random_state = 42)
model_lr = lr.fit(X_train,y_train)


In [331]:
y_pred_test = lr.predict(X_test)
target_names = ['-1','0','1','2']
y_pred_test

array(['2', '1', '1', ..., '0', '1', '1'], dtype=object)

In [332]:
print (classification_report_imbalanced(y_test,y_pred_test,target_names ))

                   pre       rec       spe        f1       geo       iba       sup

         -1       0.84      0.20      1.00      0.32      0.45      0.18       259
          0       0.70      0.28      0.98      0.40      0.52      0.26       471
          1       0.70      0.93      0.54      0.80      0.71      0.52      1706
          2       0.82      0.73      0.95      0.77      0.84      0.68       728

avg / total       0.74      0.73      0.74      0.70      0.69      0.49      3164



In [333]:
precision_score(y_test,y_pred_test,average ='micro')

0.7285082174462706

In [334]:
f1_score(y_test,y_pred_test,average = 'macro')

0.5746349385850005

In [335]:
pred = lr.predict(X_test)

In [336]:
test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [337]:
test['URL'] = test['message'].str.extractall('(https?://[^>]+)').unstack()

In [338]:
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www.\.\S+')
    return url_pattern.sub(r'',text)

In [339]:
tweet = test['message'][2]
tweet

'The scary, unimpeachable evidence that climate change is already here: https://t.co/yAedqcV9Ki #itstimetochange #climatechange @ZEROCO2_;..'

In [340]:
remove_urls(tweet)

'The scary, unimpeachable evidence that climate change is already here:  #itstimetochange #climatechange @ZEROCO2_;..'

In [341]:
def to_lower(text):
     
    return text.lower()

In [342]:
to_lower(tweet)

'the scary, unimpeachable evidence that climate change is already here: https://t.co/yaedqcv9ki #itstimetochange #climatechange @zeroco2_;..'

In [343]:
stemmer = SnowballStemmer('english')


def get_roots(text):
    " ".join([stemmer.stem(w) for w in str(text).split()])
    return" ".join([stemmer.stem(w) for w in str(text).split()])

In [344]:
get_roots(tweet)

'the scary, unimpeach evid that climat chang is alreadi here: https://t.co/yaedqcv9ki #itstimetochang #climatechang @zeroco2_;..'

In [345]:
tokenizer = TreebankWordTokenizer()

In [346]:
tokenizer.tokenize(tweet)

['The',
 'scary',
 ',',
 'unimpeachable',
 'evidence',
 'that',
 'climate',
 'change',
 'is',
 'already',
 'here',
 ':',
 'https',
 ':',
 '//t.co/yAedqcV9Ki',
 '#',
 'itstimetochange',
 '#',
 'climatechange',
 '@',
 'ZEROCO2_',
 ';',
 '..']

In [347]:
tweet1 = remove_urls(tweet)
tweet1

'The scary, unimpeachable evidence that climate change is already here:  #itstimetochange #climatechange @ZEROCO2_;..'

In [348]:
tweet2 = to_lower(tweet)
tweet2

'the scary, unimpeachable evidence that climate change is already here: https://t.co/yaedqcv9ki #itstimetochange #climatechange @zeroco2_;..'

In [349]:
tweet3 = get_roots(tweet)
tweet3

'the scary, unimpeach evid that climat chang is alreadi here: https://t.co/yaedqcv9ki #itstimetochang #climatechang @zeroco2_;..'

In [350]:
tweet4 = tokenizer.tokenize(tweet)
tweet4

['The',
 'scary',
 ',',
 'unimpeachable',
 'evidence',
 'that',
 'climate',
 'change',
 'is',
 'already',
 'here',
 ':',
 'https',
 ':',
 '//t.co/yAedqcV9Ki',
 '#',
 'itstimetochange',
 '#',
 'climatechange',
 '@',
 'ZEROCO2_',
 ';',
 '..']

In [351]:
test['message_clean']= test['message'].apply(remove_urls).apply(to_lower).apply(get_roots).apply(tokenizer.tokenize)
test['message_clean'].head()

0    [europ, will, now, be, look, to, china, to, ma...
1    [combin, this, with, the, poll, of, staffer, r...
2    [the, scary, ,, unimpeach, evid, that, climat,...
3    [@, karoli, @, morgfair, @, osborneink, @, dai...
4    [rt, @, fakewillmoore, :, femal, orgasm, caus,...
Name: message_clean, dtype: object

In [352]:
#tt = TfidfVectorizer(preprocessor = list,tokenizer =list,ngram_range=(1,2),min_df = 2,strip_accents = 'ascii',smooth_idf = False)

#tt.fit(test['message_clean'])
#vect = tt.transform(train['message_clean'])

In [353]:
test_vec =tt.transform(test['message_clean'])
test_vec.shape

(10546, 35761)

In [354]:
test_vec.toarray()

MemoryError: Unable to allocate 2.81 GiB for an array with shape (10546, 35761) and data type float64

In [None]:
x = test_vec.toarray()
x

In [355]:
print (classification_report_imbalanced(y_pred_test,y_test ))

                   pre       rec       spe        f1       geo       iba       sup

         -1       0.20      0.84      0.93      0.32      0.88      0.78        62
          0       0.28      0.70      0.89      0.40      0.79      0.61       188
          1       0.93      0.70      0.87      0.80      0.78      0.60      2265
          2       0.73      0.82      0.92      0.77      0.87      0.75       649

avg / total       0.84      0.73      0.88      0.76      0.80      0.63      3164



In [356]:
f1_score(y_test,y_pred_test,average = 'macro')

0.5746349385850005

In [357]:
test_pred = lr.predict(test_vec)

In [359]:
pred_df = pd.DataFrame (test['tweetid'])

In [360]:
pred_df['sentiment'] = test_pred

In [361]:
pred_df.to_csv('nafeesa.csv',index = False)