In [1]:
import numpy as np
import pandas as pd
import nltk
import re

In [2]:
# Loading the dataset of tweets

df_train=pd.read_csv('../input/train.csv')

In [3]:
# Information about the dataset

print(df_train.info())
print(df_train.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31962 entries, 0 to 31961
Data columns (total 3 columns):
id       31962 non-null int64
label    31962 non-null int64
tweet    31962 non-null object
dtypes: int64(2), object(1)
memory usage: 749.2+ KB
None
   id                        ...                                                                      tweet
0   1                        ...                           @user when a father is dysfunctional and is s...
1   2                        ...                          @user @user thanks for #lyft credit i can't us...
2   3                        ...                                                        bihday your majesty
3   4                        ...                          #model   i love u take with u all the time in ...
4   5                        ...                                     factsguide: society now    #motivation

[5 rows x 3 columns]


In [5]:
# Checking Class Distribution

df_train['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

In [6]:
# Storing the tweets and the labels

tweets = df_train['tweet'].str.lower()
Y = df_train['label']

tweets

0         @user when a father is dysfunctional and is s...
1        @user @user thanks for #lyft credit i can't us...
2                                      bihday your majesty
3        #model   i love u take with u all the time in ...
4                   factsguide: society now    #motivation
5        [2/2] huge fan fare and big talking before the...
6         @user camping tomorrow @user @user @user @use...
7        the next school year is the year for exams.ð...
8        we won!!! love the land!!! #allin #cavs #champ...
9         @user @user welcome here !  i'm   it's so #gr...
10        â #ireland consumer price index (mom) climb...
11       we are so selfish. #orlando #standwithorlando ...
12       i get to see my daddy today!!   #80days #getti...
13       @user #cnn calls #michigan middle school 'buil...
14       no comment!  in #australia   #opkillingbay #se...
15       ouch...junior is angryð#got7 #junior #yugyo...
16       i am thankful for having a paner. #thankful #p.

# PREPROCESSING

In [7]:
# Replacing @handle with the word USER

tweets = tweets.str.replace(r'@[\S]+', 'user')

# Replacing the Hast tag with the word hASH

tweets = tweets.str.replace(r'#(\S+)','hash')

# Removing the all the Retweets

tweets = tweets.str.replace(r'\brt\b',' ')

tweets

0         user when a father is dysfunctional and is so...
1        user user thanks for hash credit i can't use c...
2                                      bihday your majesty
3        hash   i love u take with u all the time in ur...
4                          factsguide: society now    hash
5        [2/2] huge fan fare and big talking before the...
6         user camping tomorrow user user user user use...
7        the next school year is the year for exams.ð...
8        we won!!! love the land!!! hash hash hash hash...
9          user user welcome here !  i'm   it's so hash ! 
10        â hash consumer price index (mom) climbed f...
11       we are so selfish. hash hash hash hash hash ha...
12               i get to see my daddy today!!   hash hash
13       user hash calls hash middle school 'build the ...
14        no comment!  in hash   hash hash hash hash  hash
15        ouch...junior is angryðhash hash hash   hash 
16        i am thankful for having a paner. hash hash   

In [8]:
df_train['tweet'].str.extractall(r'((www\.[\S]+)|(http?://[\S]+))')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1111,0,www.flybcc.com,www.flybcc.com,
6375,0,www...,www...,
8084,0,www.drunk,www.drunk,
8484,0,www.smokeweedeatbacon.com,www.smokeweedeatbacon.com,
8660,0,www.alvarum/heloiseetlespremas,www.alvarum/heloiseetlespremas,
25745,0,www...,www...,


In [9]:
# Replacing the URL or Web Address

tweets = tweets.str.replace(r'((www\.[\S]+)|(http?://[\S]+))','URL')

# Replacing Two or more dots with one

tweets = tweets.str.replace(r'\.{2,}', ' ')

In [10]:
# Removing all the special Characters

tweets = tweets.str.replace(r'[^\w\d\s]',' ')

# Removing all the non ASCII characters

tweets = tweets.str.replace(r'[^\x00-\x7F]+',' ')

# Removing the leading and trailing Whitespaces

tweets = tweets.str.replace(r'^\s+|\s+?$','')

# Replacing multiple Spaces with Single Space

tweets = tweets.str.replace(r'\s+',' ')

tweets

0        user when a father is dysfunctional and is so ...
1        user user thanks for hash credit i can t use c...
2                                      bihday your majesty
3             hash i love u take with u all the time in ur
4                              factsguide society now hash
5        2 2 huge fan fare and big talking before they ...
6        user camping tomorrow user user user user user...
7        the next school year is the year for exams can...
8            we won love the land hash hash hash hash hash
9                  user user welcome here i m it s so hash
10       hash consumer price index mom climbed from pre...
11       we are so selfish hash hash hash hash hash has...
12                   i get to see my daddy today hash hash
13       user hash calls hash middle school build the w...
14             no comment in hash hash hash hash hash hash
15                ouch junior is angry hash hash hash hash
16              i am thankful for having a paner hash ha

In [11]:
# Removing the Stopwords

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

tweets = tweets.apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

In [12]:
# Removing the words stem using Snowball Stemmer

from nltk.stem import *

SS = SnowballStemmer("english")

tweets = tweets.apply(lambda x: ' '.join(SS.stem(word) for word in x.split()))

tweets

0        user father dysfunct selfish drag kid dysfunct...
1        user user thank hash credit use caus offer whe...
2                                           bihday majesti
3                               hash love u take u time ur
4                                   factsguid societi hash
5        2 2 huge fan fare big talk leav chao pay dispu...
6        user camp tomorrow user user user user user us...
7        next school year year exam think hash hash has...
8                       love land hash hash hash hash hash
9                                    user user welcom hash
10       hash consum price index mom climb previous 0 2...
11       selfish hash hash hash hash hash hash hash has...
12                           get see daddi today hash hash
13       user hash call hash middl school build wall ch...
14                   comment hash hash hash hash hash hash
15                   ouch junior angri hash hash hash hash
16                                   thank paner hash ha

In [13]:
from nltk.tokenize import word_tokenize

# Creating a Bag of Words

words = []

for text in tweets:
    word = word_tokenize(text)
    for i in word:
        words.append(i)

In [14]:
from nltk.probability import FreqDist

words = nltk.FreqDist(words)

print ("Total Number of words {}".format(len(words)))
print ("First 30 most common words {}".format(words.most_common(30)))

Total Number of words 16926
First 30 most common words [('hash', 74900), ('user', 17534), ('day', 2554), ('amp', 1749), ('happi', 1740), ('love', 1557), ('get', 1252), ('time', 1211), ('u', 1170), ('go', 1144), ('thank', 1043), ('like', 1032), ('today', 1029), ('make', 966), ('new', 929), ('see', 864), ('one', 835), ('peopl', 832), ('good', 810), ('want', 778), ('father', 746), ('life', 734), ('take', 730), ('look', 710), ('feel', 694), ('need', 654), ('wait', 642), ('come', 636), ('work', 627), ('2', 613)]


In [15]:
# Choosing the first 5000 words as Features

word_features = list(words.keys())[:5000]

In [16]:
# Finding if a word in the word_features is present in the tweets
def finding_features(tweet):
    text = word_tokenize(tweet)
    features={}
    for i in word_features:
        features[i] = (i in text)
    return features

# Zipping the Processed tweets with the Labels
tweets_featlab = zip(tweets, Y)

In [17]:
# Calling the finding_feature function for all the tweets
feature_set = [(finding_features(TW) ,label) for (TW,label) in tweets_featlab]

In [18]:
seed=1
np.random.seed = seed

# Splitting Training and Testing Datasets
from sklearn.model_selection import train_test_split

train, test = train_test_split(feature_set, test_size = 0.50, random_state = seed)

print ('Training Size: {}'.format(len(train)))
print ('Testing Size: {}'.format(len(test)))

Training Size: 15981
Testing Size: 15981


# Modelling

In [19]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [20]:
# Model_1 DecisionTreeClassifier

nltk_model1 = SklearnClassifier(DecisionTreeClassifier())
nltk_model1.train(train)

accuracy = nltk.classify.accuracy(nltk_model1, test)*100
print ("Accuracy of Decision tree: {}".format(accuracy))

Accuracy of Decision tree: 92.06557787372505


In [21]:
# Model_2 Stochastic Gradient Descent

nltk_model2 = SklearnClassifier(SGDClassifier(max_iter = 1000))
nltk_model2.train(train)

accuracy = nltk.classify.accuracy(nltk_model2, test)*100
print ("Accuracy of SGD: {}".format(accuracy))

Accuracy of SGD: 94.4934609849196


In [22]:
# Model_3 RandomForestClassifier

nltk_model3 = SklearnClassifier(RandomForestClassifier())
nltk_model3.train(train)

accuracy = nltk.classify.accuracy(nltk_model3, test)*100
print ("Accuracy of Random Forest: {}".format(accuracy))



Accuracy of Random Forest: 94.23064889556349


In [23]:
# Model_4 Logistic Regression

nltk_model4 = SklearnClassifier(LogisticRegression())
nltk_model4.train(train)

accuracy = nltk.classify.accuracy(nltk_model4, test)*100
print ("Accuracy of Logistic Regression: {}".format(accuracy))



Accuracy of Logistic Regression: 94.5622927226081


In [24]:
test_1, label = zip(*feature_set)

# Classification Report and Confusion Matrix for the Models
models = [nltk_model1, nltk_model2, nltk_model3, nltk_model4]
classifiers = ['Decision Tree', 'SGD', 'Random forest', 'Logistic Regression']
i = 0
for model in models:
    predictions = model.classify_many(test_1)
    class_=classifiers[i]
    print ("Report for {}".format(class_))
    print (classification_report(label, predictions))
    pd.DataFrame(confusion_matrix(label, predictions))
    i+=1

Report for Decision Tree
              precision    recall  f1-score   support

           0       0.98      0.98      0.98     29720
           1       0.71      0.70      0.71      2242

    accuracy                           0.96     31962
   macro avg       0.84      0.84      0.84     31962
weighted avg       0.96      0.96      0.96     31962

Report for SGD
              precision    recall  f1-score   support

           0       0.96      0.99      0.98     29720
           1       0.85      0.44      0.58      2242

    accuracy                           0.96     31962
   macro avg       0.90      0.72      0.78     31962
weighted avg       0.95      0.96      0.95     31962

Report for Random forest
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     29720
           1       0.86      0.64      0.73      2242

    accuracy                           0.97     31962
   macro avg       0.92      0.81      0.86     31962
weighted 