# Initial Jupyter notebook for data mining project


Import all libraries needed for the project

In [1]:
# from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd  # this is how I usually import pandas
import sys  # only needed to determine Python version number
from string import punctuation
import re
import nltk
from nltk.stem import StemmerI, RegexpStemmer, LancasterStemmer, ISRIStemmer, PorterStemmer, SnowballStemmer, RSLPStemmer
from nltk import word_tokenize
# nltk.download(u'stopwords')
from nltk.corpus import stopwords

# import enchant
# import hunspell

# Enable inline plotting
%matplotlib inline

In [2]:
# For classification
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
# import my functions
from myFunctions import *

In [4]:
# Load the training tweets from a tab-separated file.
# Column names are passed explicitly, so pandas treats every line as data.
Location = r'../twitter_data/myTrain.tsv'
df = pd.read_csv(
    Location,
    names=['ID_1', 'ID_2', 'Label', 'Text'],
    sep='\t',
)

In [5]:
# Preview only the first rows: a bare `df` renders the whole frame (up to
# pandas' display limits), which bloats the notebook as the dataset grows.
df.head()

Unnamed: 0,ID_1,ID_2,Label,Text
0,264183816548130816,15140428,positive,#Wow @BB Gas by my house hit $3.39!!!! I'm goi...
1,263405084770172928,591166521,negative,"Theo Walcott is still shit, watch Rafa and Joh..."
2,262163168678248449,35266263,negative,"its not that I'm a GSP fan, i just hate Nick D..."


In [6]:
# Preprocess the data
# `preprocess` is not defined in this notebook; it arrives through the star
# import from myFunctions. Judging by the output displayed for processed_list,
# it yields a list of (row index, cleaned text) tuples -- TODO confirm against
# myFunctions.
processed_list = preprocess(df)

In [7]:
# Preview only the first few processed items instead of echoing the whole
# list, which would flood the cell output on a full-size dataset.
processed_list[:5]

[(0, 'ga hous hit 339 im go chapel hill sat'),
 (1, 'theo walcott still shit watch rafa johnni deal saturday'),
 (2, 'im gsp fan hate nick diaz cant wait februari')]

# Do the classification

In [8]:
# Build label encoder for categories
# Fitting on the "Label" column lets the encoder learn the distinct class
# names (read back later through le.classes_). fit() is the last expression
# of the cell, so the fitted encoder's repr is what the cell displays.
le = preprocessing.LabelEncoder()
le.fit(df["Label"])

LabelEncoder()

In [9]:
# Transform categories into numbers
# y holds one integer code per row of df["Label"]; le.inverse_transform maps
# such codes back to the original label strings.
y = le.transform(df["Label"])

In [10]:
# Keep only the cleaned text (second element) of every processed item;
# this list is what gets vectorized.
processed_content = [item[1] for item in processed_list]

# Show a short preview rather than echoing the entire list.
processed_content[:5]

['ga hous hit 339 im go chapel hill sat',
 'theo walcott still shit watch rafa johnni deal saturday',
 'im gsp fan hate nick diaz cant wait februari']

In [11]:
# Vectorize Content
# Turn the cleaned tweet strings into the feature matrix X used by the
# classifiers. Exactly one of the two options below should be active;
# to switch, comment out one and uncomment the other.

# Option 1: CountVectorizer (bag of words: raw token counts)

# count_vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS)
# X = count_vectorizer.fit_transform(processed_content)

# Option 2 (active): TfidfVectorizer (TF-IDF weighted token features)

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_content)


Classification using SVM classifier

In [12]:
# Linear-kernel SVM trained on the full feature matrix.
# fit() hands back the fitted estimator, so construction and training chain.
clf = svm.SVC(kernel='linear').fit(X, y)

# There is no hold-out split yet: the "test" predictions are made on the very
# rows the model was trained on, so the scores below are optimistic.
y_pred = clf.predict(X)
print('\npredictions of test set (which is the same as the train set) are:', y_pred, sep='\n')

# Map the numeric predictions back to their label strings.
predicted_categories = le.inverse_transform(y_pred)
print('\npredictions of test set in text form are:', predicted_categories, sep='\n')

# Per-class precision / recall / F1, with rows named after the encoder's classes.
print(
    '\nclassification report for these predictions is:',
    classification_report(y, y_pred, target_names=list(le.classes_)),
    sep='\n',
)


predictions of test set (which is the same as the train set) are:
[1 0 0]

predictions of test set in text form are:
['positive' 'negative' 'negative']

classification report for these predictions is:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         2
    positive       1.00      1.00      1.00         1

   micro avg       1.00      1.00      1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3



Classification using KNN classifier

In [13]:

# 1-nearest-neighbour classifier fitted on the complete feature matrix
# (fit() returns the estimator itself, so the calls chain).
knn = KNeighborsClassifier(n_neighbors=1).fit(X, y)

# Evaluation reuses the training rows as the "test" set.
# NOTE(review): with n_neighbors=1 each row's nearest neighbour is normally
# itself, so a perfect score here says little about generalisation.
y_pred = knn.predict(X)
print('\npredictions of test set (which is the same as the train set) are:', y_pred, sep='\n')

# Map the numeric predictions back to their label strings.
predicted_categories = le.inverse_transform(y_pred)
print('\npredictions of test set in text form are:', predicted_categories, sep='\n')

# Per-class precision / recall / F1, with rows named after the encoder's classes.
print(
    '\nclassification report for these predictions is:',
    classification_report(y, y_pred, target_names=list(le.classes_)),
    sep='\n',
)



predictions of test set (which is the same as the train set) are:
[1 0 0]

predictions of test set in text form are:
['positive' 'negative' 'negative']

classification report for these predictions is:
              precision    recall  f1-score   support

    negative       1.00      1.00      1.00         2
    positive       1.00      1.00      1.00         1

   micro avg       1.00      1.00      1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

