In [1]:
import nltk
import string
import re 
import pandas as pd
import numpy as np
import sklearn

In [2]:
# Input files, resolved relative to the notebook's working directory
TEST_CSV_PATH = "election.csv"
TRAIN_TSV_PATH = "SemEval2018-T3-train-taskA.txt"

# Evaluation set: comma-separated file, decoded as UTF-8
test_data = pd.read_csv(TEST_CSV_PATH, encoding='utf-8')

# Training set: tab-separated file
# NOTE(review): default CSV quoting is in effect here; if tweets contain bare
# double quotes, rows may be merged -- confirm the loaded row count.
train_data = pd.read_csv(TRAIN_TSV_PATH, sep='\t')

In [4]:
# Pull the tweet-text column out of each frame (the column names differ per file)
test_tweets, train_tweets = test_data["Tweets"], train_data["Tweet text"]

In [5]:
# Pull the class-label column out of each frame (the column names differ per file)
test_classes, train_classes = test_data['Task A'], train_data['Label']

In [6]:
# Matches an http/https URL anywhere inside a tweet. The pattern is deliberately
# unanchored: anchoring it with ^...$ would only match tweets that consist of
# nothing but a URL.
URL_PATTERN = r'https?://\S+'
# Matches an integer or a decimal number such as "42" or "3.14"
NUMBER_PATTERN = r'\d+(\.\d+)?'


def _mask_urls_and_numbers(tweets):
    """Replace URLs with the token 'url' and numbers with the token 'number'.

    Args:
        tweets: pandas Series of tweet strings.

    Returns:
        A new Series with the substitutions applied; the input is not modified.
    """
    # regex=True is passed explicitly: pandas 2.0 changed the default of
    # Series.str.replace to regex=False, which would treat the patterns as
    # literal text and silently replace nothing.
    masked = tweets.str.replace(URL_PATTERN, 'url', regex=True)
    # URLs are masked first so that digits inside a URL are not turned into 'number'
    return masked.str.replace(NUMBER_PATTERN, 'number', regex=True)


processed_train = _mask_urls_and_numbers(train_tweets)
processed_test = _mask_urls_and_numbers(test_tweets)

In [7]:
def _strip_punctuation_and_squeeze(tweets):
    """Replace punctuation with spaces, collapse whitespace runs, and trim the ends.

    Args:
        tweets: pandas Series of strings.

    Returns:
        A new Series with the cleanup applied; the input is not modified.
    """
    # regex=True is passed explicitly: pandas 2.0 changed the default of
    # Series.str.replace to regex=False, which would treat these patterns as
    # literal text and silently replace nothing.
    return (
        tweets
        # Anything that is neither a word character nor whitespace becomes a space
        # (\w already covers digits, so no separate \d is needed)
        .str.replace(r'[^\w\s]', ' ', regex=True)
        # Replace whitespace between terms with a single space
        .str.replace(r'\s+', ' ', regex=True)
        # Remove leading and trailing whitespace
        .str.strip()
    )


processed_train = _strip_punctuation_and_squeeze(processed_train)
processed_test = _strip_punctuation_and_squeeze(processed_test)

In [8]:
# Lower-case both corpora with the same vectorised string operation
processed_train, processed_test = (
    tweets.str.lower() for tweets in (processed_train, processed_test)
)

In [9]:
from nltk.corpus import stopwords

# Drop English stop words from every message.
# A set is used so each token lookup is a constant-time membership test.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(text):
    """Return `text` without the whitespace-separated tokens found in `stop_words`."""
    kept_tokens = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept_tokens)


processed_train = processed_train.apply(_drop_stop_words)
processed_test = processed_test.apply(_drop_stop_words)

In [10]:
# Single Porter stemmer instance shared by both corpora
ps = nltk.PorterStemmer()


def _stem_tokens(text):
    """Return `text` with every whitespace-separated token replaced by its Porter stem."""
    stems = map(ps.stem, text.split())
    return ' '.join(stems)


processed_train = processed_train.apply(_stem_tokens)
processed_test = processed_test.apply(_stem_tokens)

In [11]:
from nltk.tokenize import word_tokenize

def _token_frequencies(messages):
    """Tokenize every message and count how often each token occurs.

    Args:
        messages: iterable of message strings.

    Returns:
        nltk.FreqDist mapping each token to its number of occurrences.
    """
    return nltk.FreqDist(
        token
        for message in messages
        for token in word_tokenize(message)
    )


# Bag-of-words: one frequency distribution per corpus
all_words_train = _token_frequencies(processed_train)
all_words_test = _token_frequencies(processed_test)

In [12]:
# Vocabulary of each corpus: the distinct tokens of its frequency distribution
# (iterating a FreqDist yields its keys)
word_features_train = list(all_words_train)
word_features_test = list(all_words_test)

In [13]:
def find_features_train(message):
    """Build the word-presence feature dict for one message using the training vocabulary.

    Args:
        message: message text; it is tokenized with ``word_tokenize``.

    Returns:
        dict mapping every word in ``word_features_train`` (in that order) to
        ``True`` if the word is one of the message's tokens, otherwise ``False``.
    """
    # A set makes each membership test constant-time; with a list every
    # vocabulary word would trigger a linear scan over the message's tokens.
    tokens = set(word_tokenize(message))
    return {word: (word in tokens) for word in word_features_train}
def find_features_test(message):
    """Build the word-presence feature dict for one message using the test vocabulary.

    Args:
        message: message text; it is tokenized with ``word_tokenize``.

    Returns:
        dict mapping every word in ``word_features_test`` (in that order) to
        ``True`` if the word is one of the message's tokens, otherwise ``False``.
    """
    # A set makes each membership test constant-time; with a list every
    # vocabulary word would trigger a linear scan over the message's tokens.
    tokens = set(word_tokenize(message))
    return {word: (word in tokens) for word in word_features_test}

In [14]:
# Pair every preprocessed message with its class label
messages_train = list(zip(processed_train, train_classes))
messages_test = list(zip(processed_test, test_classes))

# Seed NumPy's global RNG so the shuffles below are reproducible.
# np.random.seed must be *called*: assigning to it (np.random.seed = seed)
# replaces the function with an int and leaves the generator unseeded.
seed = 42
np.random.seed(seed)
np.random.shuffle(messages_train)
np.random.shuffle(messages_test)

# Turn each (text, label) pair into a (feature dict, label) pair
featuresets_train = [(find_features_train(text), label) for (text, label) in messages_train]
featuresets_test = [(find_features_test(text), label) for (text, label) in messages_test]

In [15]:
# Report the number of labelled feature sets per split: test first, then train
for split_featuresets in (featuresets_test, featuresets_train):
    print(len(split_featuresets))

1081
3817


In [16]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train. Each display name is paired by position with the
# estimator at the same index in `classifiers`.
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear","XGBoost"]

# Estimators whose fitting involves randomness receive the notebook's `seed`
# as random_state so that repeated runs report the same accuracies.
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=seed),
    RandomForestClassifier(random_state=seed),
    LogisticRegression(),
    SGDClassifier(max_iter=100, random_state=seed),
    MultinomialNB(),
    SVC(kernel='linear'),
    XGBClassifier(random_state=seed),
]

models = list(zip(names, classifiers))

# Train each model on the training feature sets through NLTK's scikit-learn
# wrapper and print its accuracy (as a percentage) on the test feature sets.
for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(featuresets_train)
    accuracy = nltk.classify.accuracy(nltk_model, featuresets_test) * 100
    print("{} Accuracy: {}".format(name, accuracy))

In [0]:
from sklearn.ensemble import VotingClassifier

# Fresh, unfitted estimators for the ensemble. Each display name is paired by
# position with the estimator at the same index in `classifiers`.
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear","XGBoost"]

# Estimators whose fitting involves randomness receive the notebook's `seed`
# as random_state so that repeated runs report the same accuracy.
classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(random_state=seed),
    RandomForestClassifier(random_state=seed),
    LogisticRegression(),
    SGDClassifier(max_iter=100, random_state=seed),
    MultinomialNB(),
    SVC(kernel='linear'),
    XGBClassifier(random_state=seed),
]
models = list(zip(names, classifiers))

# Hard voting: the ensemble predicts the class chosen by the majority of its members
nltk_ensemble = SklearnClassifier(VotingClassifier(estimators=models, voting='hard', n_jobs=-1))
nltk_ensemble.train(featuresets_train)

# Score the ensemble itself. `nltk_model` is only the last individual classifier
# left over from the per-model loop, so scoring it would report the wrong number.
accuracy = nltk.classify.accuracy(nltk_ensemble, featuresets_test) * 100
print("Voting Classifier: Accuracy: {}".format(accuracy))

In [0]:
# Split the (feature dict, label) pairs into two parallel tuples
unzipped_test = zip(*featuresets_test)
txt_features, labels = unzipped_test

# Predicted class of the voting ensemble for every test feature dict
prediction = nltk_ensemble.classify_many(txt_features)

In [0]:
# Per-class precision / recall / F1 of the ensemble predictions
print(classification_report(labels, prediction))

# Confusion matrix with two-level row and column headers.
# NOTE(review): assumes the sorted label order puts "not irony" before "irony"
# -- confirm against the label encoding of the data files.
class_names = ['not irony', 'irony']
pd.DataFrame(
    confusion_matrix(labels, prediction),
    index=[['actual'] * 2, class_names],
    columns=[['predicted'] * 2, class_names])