In [1]:
import nltk
import string
import re 
import pandas as pd
import numpy as np
import sklearn
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [2]:
train_data = pd.read_csv("SemEval2018-T3-train-taskA.txt", sep='\t')
train_data.head()

Unnamed: 0,Tweet index,Label,Tweet text
0,1,1,Sweet United Nations video. Just in time for C...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...
3,4,0,3 episodes left I'm dying over here
4,5,1,I can't breathe! was chosen as the most notabl...


In [3]:
test_data = pd.read_csv("D:\\cseproject -- with Anita Mam\\semeval_dataset\\SemEval2018-Task3-master\\datasets\\test_TaskA\\SemEval2018-T3_input_test_taskA.txt", sep='\t')
test_data.head()

Unnamed: 0,tweet index,tweet text
0,1,@Callisto1947 Can U Help?||More conservatives ...
1,2,"Just walked in to #Starbucks and asked for a ""..."
2,3,GONNA WIN http://t.co/Mc9ebqjAqj
3,4,@mickymantell He is exactly that sort of perso...
4,5,So much at work mate 10/10 #boring 100% #dead ...


In [4]:
test_tweets = test_data["tweet text"]
train_tweets = train_data["Tweet text"]

In [5]:
classes = train_data["Label"]

# Preprocessing Data

In [6]:
# replacing urls with word "url"
processed1 = train_tweets.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'url')
processed2 = test_tweets.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'url')

In [7]:
# Replace numbers with ''number''
processed1 = processed1.str.replace(r'\d+(\.\d+)?', 'number')
processed2 = processed2.str.replace(r'\d+(\.\d+)?', 'number')

In [8]:
processed1.head()

0    Sweet United Nations video. Just in time for C...
1    @mrdahlnumber We are rumored to have talked to...
2    Hey there! Nice to see you Minnesota/ND Winter...
3             number episodes left I'm dying over here
4    I can't breathe! was chosen as the most notabl...
Name: Tweet text, dtype: object

In [9]:
processed2.head()

0    @Callistonumber Can U Help?||More conservative...
1    Just walked in to #Starbucks and asked for a "...
2                GONNA WIN http://t.co/McnumberebqjAqj
3    @mickymantell He is exactly that sort of perso...
4    So much at work mate number/number #boring num...
Name: tweet text, dtype: object

In [10]:
# Remove punctuation
processed1 = processed1.str.replace(r'[^\w\d\s]', ' ')
processed2 = processed2.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed1 = processed1.str.replace(r'\s+', ' ')
processed2 = processed2.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed1 = processed1.str.replace(r'^\s+|\s+?$', '')
processed2 = processed2.str.replace(r'^\s+|\s+?$', '')

In [11]:
# change words to lower case
processed1 = processed1.str.lower()
processed2 = processed2.str.lower()

In [12]:
processed1.head()

0    sweet united nations video just in time for ch...
1    mrdahlnumber we are rumored to have talked to ...
2    hey there nice to see you minnesota nd winter ...
3             number episodes left i m dying over here
4    i can t breathe was chosen as the most notable...
Name: Tweet text, dtype: object

In [13]:
processed2.head()

0    callistonumber can u help more conservatives n...
1    just walked in to starbucks and asked for a ta...
2                  gonna win http t co mcnumberebqjaqj
3    mickymantell he is exactly that sort of person...
4    so much at work mate number number boring numb...
Name: tweet text, dtype: object

In [14]:
from nltk.corpus import stopwords

# remove stop words from text messages

stop_words = set(stopwords.words('english'))

processed1 = processed1.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

processed2 = processed2.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

In [15]:
# Remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed1 = processed1.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

processed2 = processed2.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

In [16]:
processed1.head()

0    sweet unit nation video time christma imagin n...
1    mrdahlnumb rumor talk erv agent angel ask ed e...
2             hey nice see minnesota nd winter weather
3                               number episod left die
4    breath chosen notabl quot year annual list rel...
Name: Tweet text, dtype: object

In [17]:
processed2.head()

0    callistonumb u help conserv need tsu get paid ...
1                walk starbuck ask tall blond hahahaha
2                    gonna win http co mcnumberebqjaqj
3               mickymantel exactli sort person weirdo
4    much work mate number number bore number dead ...
Name: tweet text, dtype: object

# Feature Extraction

In [18]:
from nltk.tokenize import word_tokenize

# create bag-of-words
all_words_train = []

for message in processed1:
    words = word_tokenize(message)
    for w in words:
        all_words_train.append(w)
        
all_words_train = nltk.FreqDist(all_words_train)

all_words_test = []

for message in processed2:
    words = word_tokenize(message)
    for w in words:
        all_words_test.append(w)
        
all_words_test = nltk.FreqDist(all_words_test)

In [19]:
# print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(all_words_train)))
print('Most common words: {}'.format(all_words_train.most_common(15)))

print('Number of words: {}'.format(len(all_words_test)))
print('Most common words: {}'.format(all_words_test.most_common(15)))

Number of words: 10633
Most common words: [('co', 943), ('http', 938), ('number', 679), ('love', 284), ('get', 229), ('day', 204), ('like', 196), ('go', 180), ('one', 142), ('work', 132), ('see', 129), ('time', 128), ('peopl', 121), ('great', 119), ('christma', 115)]
Number of words: 3518
Most common words: [('http', 218), ('co', 218), ('number', 120), ('get', 49), ('love', 47), ('like', 41), ('one', 40), ('day', 37), ('work', 33), ('peopl', 33), ('make', 31), ('good', 30), ('today', 29), ('go', 28), ('think', 28)]


In [20]:
# use the 1500 most common words as features
word_features_train = list(all_words_train.keys())[:10500]

word_features_test = list(all_words_test.keys())[:1500]

In [21]:
# The find_features function will determine which of the 1500 word features are contained in the review
def find_features_train(message):
    words = word_tokenize(message)
    features_train = {}
    for word in word_features_train:
        features_train[word] = (word in words)

    return features_train

def find_features_test(message):
    words = word_tokenize(message)
    features_test = {}
    for word in word_features_test:
        features_test[word] = (word in words)

    return features_test

In [22]:
# Now lets do it for all the messages
messages_train = list(zip(processed1,classes))
messages_test = list(processed2)

# define a seed for reproducibility
seed = 42
np.random.seed = seed
np.random.shuffle(messages_train)
np.random.shuffle(messages_test)

# call find_features function for each tweet message
featuresets_train = [(find_features_train(text), label) for (text, label) in messages_train]
featuresets_test = [(find_features_test(text)) for (text) in messages_test]

In [23]:
# we can split the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets_train, test_size = 0.25, random_state=seed)

In [24]:
print(len(training))
print(len(testing))

2862
955


In [25]:
txt_features, labels = zip(*testing)

# Classifier

In [26]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 63.455497382198956


In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear","XGBoost"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear'),
    XGBClassifier()
]

models = list(zip(names, classifiers))

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))
    prediction = nltk_model.classify_many(txt_features)
    print(classification_report(labels, prediction))


K Nearest Neighbors Accuracy: 55.811518324607334
              precision    recall  f1-score   support

           0       0.57      0.56      0.56       487
           1       0.55      0.55      0.55       468

    accuracy                           0.56       955
   macro avg       0.56      0.56      0.56       955
weighted avg       0.56      0.56      0.56       955

Decision Tree Accuracy: 59.58115183246073
              precision    recall  f1-score   support

           0       0.59      0.65      0.62       487
           1       0.60      0.54      0.57       468

    accuracy                           0.60       955
   macro avg       0.60      0.59      0.59       955
weighted avg       0.60      0.60      0.59       955





Random Forest Accuracy: 63.769633507853406
              precision    recall  f1-score   support

           0       0.65      0.63      0.64       487
           1       0.63      0.64      0.64       468

    accuracy                           0.64       955
   macro avg       0.64      0.64      0.64       955
weighted avg       0.64      0.64      0.64       955





Logistic Regression Accuracy: 64.92146596858639
              precision    recall  f1-score   support

           0       0.66      0.64      0.65       487
           1       0.64      0.66      0.65       468

    accuracy                           0.65       955
   macro avg       0.65      0.65      0.65       955
weighted avg       0.65      0.65      0.65       955

SGD Classifier Accuracy: 62.5130890052356
              precision    recall  f1-score   support

           0       0.64      0.60      0.62       487
           1       0.61      0.65      0.63       468

    accuracy                           0.63       955
   macro avg       0.63      0.63      0.63       955
weighted avg       0.63      0.63      0.62       955

Naive Bayes Accuracy: 65.44502617801047
              precision    recall  f1-score   support

           0       0.69      0.59      0.63       487
           1       0.63      0.72      0.67       468

    accuracy                           0.65       95

In [28]:
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear","XGBoost"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear'),
    XGBClassifier()
]
models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 60.10471204188481


In [29]:
# make class label prediction for testing set

prediction = nltk_ensemble.classify_many(txt_features)

In [30]:
# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['not irony', 'irony']],
    columns = [['predicted', 'predicted'], ['not irony', 'irony']])

              precision    recall  f1-score   support

           0       0.65      0.67      0.66       487
           1       0.64      0.62      0.63       468

    accuracy                           0.65       955
   macro avg       0.64      0.64      0.64       955
weighted avg       0.64      0.65      0.64       955



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,not irony,irony
actual,not irony,324,163
actual,irony,176,292
