In [2]:
# Machine Learning Packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.utils import resample
from sklearn.utils.class_weight import compute_class_weight

# Natural Language Processing (NLP) Libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
import string

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from lime.lime_text import LimeTextExplainer
from collections import OrderedDict
from IPython.display import display_html

# Data Manipulation and Processing
import pandas as pd
import numpy as np
from collections import Counter
from tabulate import tabulate

# Imbalanced Data Handling
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

import warnings
pd.options.mode.chained_assignment = None


In [3]:
# Load the training data from the CSV file in the current working directory.
df = pd.read_csv("train.csv")

In [4]:
# Model input is the `tweet` column; the prediction target is the `class` column.
x = df["tweet"]
y = df["class"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Candidate models: each classifier is paired once with raw term counts
# (CountVectorizer) and once with TF-IDF weights (TfidfVectorizer).
#
# NOTE(review): the imports cell binds `Pipeline` twice (sklearn.pipeline, then
# imblearn.pipeline); the later imblearn import is the class used here.

# --- Multinomial Naive Bayes ---
pipe_nb_cv = Pipeline(
    steps=[
        ("cv", CountVectorizer()),
        ("nb", MultinomialNB()),
    ]
)
pipe_nb_tfidf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("nb", MultinomialNB()),
    ]
)

# --- Logistic Regression (default hyperparameters) ---
pipe_lr_cv = Pipeline(
    steps=[
        ("cv", CountVectorizer()),
        ("lr", LogisticRegression()),
    ]
)
pipe_lr_tfidf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("lr", LogisticRegression()),
    ]
)

# --- Random Forest ---
# NOTE(review): no random_state is passed, so these two scores are expected to
# vary from run to run -- confirm whether a fixed seed is wanted.
pipe_rf_cv = Pipeline(
    steps=[
        ("cv", CountVectorizer()),
        ("rf", RandomForestClassifier()),
    ]
)
pipe_rf_tfidf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("rf", RandomForestClassifier()),
    ]
)

# --- Support Vector Machine (linear kernel, C=1) ---
pipe_svm_cv = Pipeline(
    steps=[
        ("cv", CountVectorizer()),
        ("svm", SVC(kernel="linear", C=1)),
    ]
)
pipe_svm_tfidf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("svm", SVC(kernel="linear", C=1)),
    ]
)

In [8]:
# All candidate pipelines, listed in the order their scores are printed:
# one row per classifier, count-based variant first, TF-IDF variant second.
pipelines = [
    pipe_nb_cv, pipe_nb_tfidf,
    pipe_lr_cv, pipe_lr_tfidf,
    pipe_rf_cv, pipe_rf_tfidf,
    pipe_svm_cv, pipe_svm_tfidf,
]

# Fit every pipeline on the training split, predict on the held-out split,
# and print the pipeline's steps next to its test accuracy.
for pipe in pipelines:
    # fit() returns the fitted pipeline itself, so predict() can be chained.
    y_pred = pipe.fit(x_train, y_train).predict(x_test)
    print(pipe.named_steps, accuracy_score(y_test, y_pred))

{'cv': CountVectorizer(), 'nb': MultinomialNB()} 0.8533387129312084
{'tfidf': TfidfVectorizer(), 'nb': MultinomialNB()} 0.7865644543070406
{'cv': CountVectorizer(), 'lr': LogisticRegression()} 0.8991325398426467
{'tfidf': TfidfVectorizer(), 'lr': LogisticRegression()} 0.8908614081097438
{'cv': CountVectorizer(), 'rf': RandomForestClassifier()} 0.8638289287875731
{'tfidf': TfidfVectorizer(), 'rf': RandomForestClassifier()} 0.8581803510187613
{'cv': CountVectorizer(), 'svm': SVC(C=1, kernel='linear')} 0.8971151906394997
{'tfidf': TfidfVectorizer(), 'svm': SVC(C=1, kernel='linear')} 0.9025620334879968
