## Import libraries

In [2]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer as tvect
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV as GSCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

## Import data

In [3]:
original_train = pd.read_csv('train.csv')

In [4]:
train_data = original_train.copy()

In [5]:
original_train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


In [6]:
train_data.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesn't think carbon di...,625221
1,1,It's not like we lack evidence of anthropogeni...,126103
2,2,RT @RawStory: Researchers say we have three ye...,698562
3,1,#TodayinMaker# WIRED : 2016 was a pivotal year...,573736
4,1,"RT @SoyNovioDeTodas: It's 2016, and a racist, ...",466954


Check for missing values:

In [7]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  15819 non-null  int64 
 1   message    15819 non-null  object
 2   tweetid    15819 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 370.9+ KB


In [8]:
train_data['sentiment'].isnull().sum()

0

In [9]:
train_data['message'].isnull().sum()

0

In [10]:
train_data['tweetid'].isnull().sum()

0

## Data clean-up

Convert everything to lower case:

In [11]:
train_data['message'] = train_data['message'].str.lower()

Remove punctuation:

In [12]:
def remove_punctuation_numbers(msg):
    punc_numbers = string.punctuation
    return ''.join([l for l in msg if l not in punc_numbers])
train_data['message'] = train_data['message'].apply(remove_punctuation_numbers)

Seperate feature and response:

In [13]:
y_0 = train_data['sentiment']
X_0 = train_data['message']

Remove English stop words:

In [18]:
vectorizer = tvect(ngram_range=(1,2), min_df=2, stop_words="english")
X_trans = vectorizer.fit_transform(X_0)

Sampling for imbalanced data:

In [19]:
oversample = SMOTE()
X, y = oversample.fit_resample(X_trans, y_0)

In [20]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

## Fitting and scoring

In [44]:
model = SVC(C=10, gamma=1)

In [45]:
model.fit(X_train, y_train)

SVC(C=10, gamma=1)

In [42]:
y_pred = model.predict(X_train)   
y_pred_val = model.predict(X_val)
f1 = metrics.f1_score(y_train, y_pred, average='macro')    
f1_test = metrics.f1_score(y_val, y_pred_val, average='macro')
print('F1 Train is ', f1, ' and F1 Test is ', f1_test)

F1 Train is  0.993397948932877  and F1 Test is  0.9264747125021231


## Test data

In [46]:
original_test = pd.read_csv('test.csv')

In [89]:
test_data = original_test.copy()

In [90]:
test_data.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


In [91]:
X_test = test_data['message']

In [72]:
X_test = test_data['message'].str.lower()

In [73]:
X_test = X_test.apply(remove_punctuation_numbers)

In [86]:
X_train.shape

(27296, 22840)

In [92]:
test_vect = vectorizer.transform(X_test)

In [93]:
y_test_pred = model.predict(test_vect)

ValueError: X.shape[1] = 15965 should be equal to 22840, the number of features at training time

In [52]:
test_data['sentiment'] = y_pred

In [53]:
test_data.head()

Unnamed: 0,message,tweetid,sentiment
0,Europe will now be looking to China to make su...,169760,1
1,Combine this with the polling of staffers re c...,35326,1
2,"The scary, unimpeachable evidence that climate...",224985,1
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263,1
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928,0


In [55]:
# Final .csv for submission
test_data[['tweetid','sentiment']].to_csv('submission.csv', index=False)