In [1]:
import nltk
import string
import re 
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
df = pd.read_csv("election.csv", encoding='utf-8')
df2 = pd.read_csv("election1.csv",encoding='utf-8')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1081 entries, 0 to 1080
Data columns (total 2 columns):
Tweets    1081 non-null object
Task A    1081 non-null int64
dtypes: int64(1), object(1)
memory usage: 17.0+ KB


Unnamed: 0,Tweets,Task A
0,As the Bharatiya Janata Party (BJP) looks set ...,0
1,During the 1992 United States presidential el...,0
2,I don't tell lies like Modi ji. I believe in G...,1
3,"The BJP has a vision of India: one nation, on...",0
4,The Supreme Court on May 22 granted protection...,0


In [3]:
tweets = df["Tweets"]
tweets[0:20]

0     As the Bharatiya Janata Party (BJP) looks set ...
1      During the 1992 United States presidential el...
2     I don't tell lies like Modi ji. I believe in G...
3      The BJP has a vision of India: one nation, on...
4     The Supreme Court on May 22 granted protection...
5       Many see the election as a referendum on Mr ...
6     PATNA: RJD chief Lalu Prasad’s elder son Tej P...
7      NEW DELHI (AP) The Latest on India's general ...
8      NEW DELHI (AP) Indians voted Sunday in the ne...
9      Arrah in western Bihar, a constituency weighe...
10     Ramalinga Reddy is the MLA of BTM Layout cons...
11     New York: Time magazine, which published a co...
12    Former IAS officer Arjun Ram Meghwal won the 2...
13     Languages spoken in India belong to several l...
14     BSP President Mayawati during an election cam...
15     In the tweet, Kejriwal tagged Modi\u2019s twe...
16     BJP president Amit Shah will make his Lok Sab...
17    BJP candidate Tejaswi Surya was able to re

# Preprocessing Data

In [4]:
classes = df['Task A']
classes2 = df2['Task A']
classes.value_counts()

0    749
1    332
Name: Task A, dtype: int64

In [5]:
classes2.value_counts()

0    759
1    322
Name: Task A, dtype: int64

In [6]:
#Inter-rater agreement Kappas (value changed at an interval of 16 in csv)
from sklearn.metrics import cohen_kappa_score
cohen_kappa_score(classes, classes2)

0.9780804029474862

In [7]:
# replacing urls with word "url"
processed = tweets.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'url')

In [8]:
# Replace numbers with ''number''
processed = processed.str.replace(r'\d+(\.\d+)?', 'number')

In [9]:
processed.head()

0    As the Bharatiya Janata Party (BJP) looks set ...
1     During the number United States presidential ...
2    I don't tell lies like Modi ji. I believe in G...
3     The BJP has a vision of India: one nation, on...
4    The Supreme Court on May number granted protec...
Name: Tweets, dtype: object

In [10]:
# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [11]:
# change words to lower case
processed = processed.str.lower()

In [12]:
processed.head()

0    as the bharatiya janata party bjp looks set fo...
1    during the number united states presidential e...
2    i don t tell lies like modi ji i believe in ga...
3    the bjp has a vision of india one nation one h...
4    the supreme court on may number granted protec...
Name: Tweets, dtype: object

In [13]:
from nltk.corpus import stopwords

# remove stop words from text messages

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

In [14]:
# Remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

In [15]:
processed.head()

0    bharatiya janata parti bjp look set massiv unp...
1    number unit state presidenti elect campaign sl...
2    tell lie like modi ji believ gandhi ji alway t...
3    bjp vision india one nation one histori one cu...
4    suprem court may number grant protect arrest t...
Name: Tweets, dtype: object

# Feature Extraction

In [16]:
from nltk.tokenize import word_tokenize

# create bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words = nltk.FreqDist(all_words)

In [17]:
# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 4830
Most common words: [('number', 1464), ('unumb', 691), ('char', 552), ('r', 497), ('modi', 491), ('elect', 482), ('minist', 405), ('parti', 404), ('bjp', 358), ('prime', 282), ('narendra', 267), ('congress', 240), ('india', 233), ('lok', 227), ('sabha', 220)]


In [18]:
# use the 4830 most common words as features
word_features = list(all_words.keys())[:4830]

In [19]:
# The find_features function will determine which of the 1500 word features are contained in the review
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features
# # Lets see an example!
# features = find_features(processed[0])
# for key, value in features.items():
#     if value == True:
#         print( key)

In [20]:
# Now lets do it for all the messages
messages = list(zip(processed,classes))

# define a seed for reproducibility
seed = 42
np.random.seed = seed
np.random.shuffle(messages)

# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

In [21]:
# we can split the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)

In [22]:
print(len(training))
print(len(testing))

810
271


In [23]:
txt_features, labels = zip(*testing)

# Classifier

In [24]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
prediction = model.classify_many(txt_features)
print(classification_report(labels, prediction))
print("SVC Accuracy: {}".format(accuracy))

precision    recall  f1-score   support

           0       0.91      0.85      0.88       194
           1       0.67      0.79      0.73        77

    accuracy                           0.83       271
   macro avg       0.79      0.82      0.80       271
weighted avg       0.84      0.83      0.83       271

SVC Accuracy: 83.02583025830258


In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier


# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear","XGBoost"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear'),
    XGBClassifier()
]

models = list(zip(names, classifiers))

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    prediction = nltk_model.classify_many(txt_features)
    print("{} Accuracy: {}".format(name, accuracy))
    print(classification_report(labels, prediction))

K Nearest Neighbors Accuracy: 60.51660516605166
              precision    recall  f1-score   support

           0       0.90      0.51      0.65       194
           1       0.41      0.86      0.55        77

    accuracy                           0.61       271
   macro avg       0.65      0.68      0.60       271
weighted avg       0.76      0.61      0.62       271

Decision Tree Accuracy: 78.22878228782287
              precision    recall  f1-score   support

           0       0.88      0.80      0.84       194
           1       0.60      0.73      0.65        77

    accuracy                           0.78       271
   macro avg       0.74      0.77      0.75       271
weighted avg       0.80      0.78      0.79       271

Random Forest Accuracy: 84.50184501845018
              precision    recall  f1-score   support

           0       0.88      0.91      0.89       194
           1       0.75      0.68      0.71        77

    accuracy                           0.85       

In [26]:
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear","XGBoost"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear'),
    XGBClassifier()
]
models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing)*100
print("voting classifier accuracy : ", accuracy)

voting classifier accuracy :  85.97785977859779


In [27]:
# make class label prediction for testing set
prediction = nltk_ensemble.classify_many(txt_features)

In [28]:
# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['not irony', 'irony']],
    columns = [['predicted', 'predicted'], ['not irony', 'irony']])

precision    recall  f1-score   support

           0       0.91      0.89      0.90       194
           1       0.73      0.79      0.76        77

    accuracy                           0.86       271
   macro avg       0.82      0.84      0.83       271
weighted avg       0.86      0.86      0.86       271



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,not irony,irony
actual,not irony,172,22
actual,irony,16,61
