# Initial Jupyter notebook for data mining project with real data


Import all libraries needed for the tutorial

In [1]:
# from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
import pandas as pd  # this is how I usually import pandas
import sys  # only needed to determine Python version number
from string import punctuation
import re
import nltk
from nltk.stem import StemmerI, RegexpStemmer, LancasterStemmer, ISRIStemmer, PorterStemmer, SnowballStemmer, RSLPStemmer
from nltk import word_tokenize
# nltk.download(u'stopwords')
from nltk.corpus import stopwords

# import enchant
# import hunspell

# Enable inline plotting
%matplotlib inline

In [2]:
# For classification
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
# import my functions
from myFunctions import *

Read the train data

In [4]:

Location = r'../twitter_data/train2017.tsv'
df = pd.read_csv(Location, sep='\t', names=['ID_1', 'ID_2', 'Label', 'Text'])

# use only a part of csv
df = df[:10000]

# Preprocess the traindata
processed_list = preprocess(df)

# print data
df


Unnamed: 0,ID_1,ID_2,Label,Text
0,264183816548130816,15140428,positive,Gas by my house hit $3.39!!!! I'm going to Cha...
1,263405084770172928,591166521,negative,"Theo Walcott is still shit, watch Rafa and Joh..."
2,262163168678248449,35266263,negative,"its not that I'm a GSP fan, i just hate Nick D..."
3,264249301910310912,18516728,negative,Iranian general says Israel's Iron Dome can't ...
4,262682041215234048,254373818,neutral,"Tehran, Mon Amour: Obama Tried to Establish Ti..."
5,264229576773861376,518129399,neutral,I sat through this whole movie just for Harry ...
6,264105751826538497,147088367,positive,with J Davlar 11th. Main rivals are team Polan...
7,264094586689953794,332474633,negative,"Talking about ACT's && SAT's, deciding where I..."
8,212392538055778304,274996324,neutral,"Why is \""""Happy Valentines Day\"""" trending? It..."
9,254941790757601280,557103111,negative,"They may have a SuperBowl in Dallas, but Dalla..."


read the test data

In [5]:

testLocation = r'../twitter_data/test2017.tsv'
testDf = pd.read_csv(testLocation, sep='\t', names=['ID_1', 'ID_2', 'Label', 'Text'])

# use only a part of csv
testDf = testDf[:10000]

# Preprocess the testData
processed_list_test = preprocess(testDf)

# print data
testDf


Unnamed: 0,ID_1,ID_2,Label,Text
0,801989080477154944,801989080477154944,UNKNOWN,#ArianaGrande Ari By Ariana Grande 80% Full #S...
1,801989272341453952,801989272341453952,UNKNOWN,Ariana Grande KIIS FM Yours Truly CD listening...
2,801990978424962944,801990978424962944,UNKNOWN,Ariana Grande White House Easter Egg Roll in W...
3,801996232553963008,801996232553963008,UNKNOWN,#CD #Musics Ariana Grande Sweet Like Candy 3.4...
4,801998343442407040,801998343442407040,UNKNOWN,SIDE TO SIDE @arianagrande #sidetoside #aria...
5,802001659970744064,802001659970744064,UNKNOWN,Hairspray Live! Previews at the Macy's Thanksg...
6,802003380973568000,802003380973568000,UNKNOWN,#LindsayLohan Is 'Feeling Thankful' After Blas...
7,802014830467174016,802014830467174016,UNKNOWN,I hate her but... I love her songs Dammit ._.#...
8,802020578609623040,802020578609623040,UNKNOWN,Ariana Grande Right There ft. Big Sean # #ar...
9,802021059490934016,802021059490934016,UNKNOWN,which one would you prefer to listen to for a ...


read the correct results

In [6]:

resultsLocation = r'../twitter_data/SemEval2017_task4_subtaskA_test_english_gold.txt'
testResults = pd.read_csv(resultsLocation, sep='\t', names=['ID', 'Label'])

# use only a part of csv
testResults = testResults[:10000]

# print data
testResults


Unnamed: 0,ID,Label
0,801989080477154944,neutral
1,801989272341453952,positive
2,801990978424962944,positive
3,801996232553963008,positive
4,801998343442407040,neutral
5,802001659970744064,positive
6,802003380973568000,positive
7,802014830467174016,neutral
8,802020578609623040,neutral
9,802021059490934016,positive


# Do the classification

In [7]:
# Build label encoder for categories
le = preprocessing.LabelEncoder()
le.fit(df["Label"])

LabelEncoder()

In [8]:
# Transform categories into numbers
y = le.transform(df["Label"])
y_test = le.transform(testResults["Label"])

In [9]:
# get processed content for list
processed_content = [item[1] for item in processed_list]
processed_content_test = [item[1] for item in processed_list_test]
# processed_content

In [10]:
# Vectorize Content
# Choose one of the below

# CountVectorizer (BOW)

# count_vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS)
# X = count_vectorizer.fit_transform(processed_content)

# TfIdfVectorizer

#train and test vectors should have the same number of features

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(processed_content)
print("X.shape is:")
print(X.shape)

X_test = vectorizer.transform(processed_content_test)
print("X_test.shape is:")
print(X_test.shape)

X.shape is:
(10000, 14498)
X_test.shape is:
(10000, 14498)


Classification using SVM classifier

In [11]:
clf = svm.SVC(kernel='linear')

# fit train set
clf.fit(X, y)

# predict test set (here is the same as the train set)
y_pred = clf.predict(X_test)

# print('\npredictions of test set (which is the same as the train set) are:')
# print(y_pred)

# Transform predictions to text
predicted_categories = le.inverse_transform(y_pred)
# print('\npredictions of test set in text form are:')
# print(predicted_categories)

# Classification_report
print('\nclassification report for these predictions is:')
print(classification_report(y_test, y_pred, target_names=list(le.classes_)))


classification report for these predictions is:
              precision    recall  f1-score   support

    negative       0.61      0.33      0.43      3220
     neutral       0.57      0.74      0.65      4812
    positive       0.49      0.52      0.50      1968

   micro avg       0.56      0.56      0.56     10000
   macro avg       0.56      0.53      0.52     10000
weighted avg       0.57      0.56      0.55     10000



Classification using KNN classifier

In [12]:

# Use KNNClassifier
knn = KNeighborsClassifier(n_neighbors=5)

# fit train set
knn.fit(X, y)

# Predict test set (here is the same as the train set)
y_pred = knn.predict(X)

# print('\npredictions of test set (which is the same as the train set) are:')
# print(y_pred)

# Transform predictions to text
predicted_categories = le.inverse_transform(y_pred)
# print('\npredictions of test set in text form are:')
# print(predicted_categories)

# Classification_report
print('\nclassification report for these predictions is:')
print(classification_report(y_test, y_pred, target_names=list(le.classes_)))



classification report for these predictions is:
              precision    recall  f1-score   support

    negative       0.33      0.08      0.13      3220
     neutral       0.48      0.80      0.60      4812
    positive       0.20      0.12      0.15      1968

   micro avg       0.43      0.43      0.43     10000
   macro avg       0.34      0.33      0.29     10000
weighted avg       0.38      0.43      0.36     10000

