In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from collections import Counter
import sklearn
import nltk

from nltk.corpus import stopwords 

### Directories

In [2]:
INPUT_DIR = '../data/input/'
OUTPUT_DIR = '../data/output/'

### Read the input train file

In [11]:
def read_input(file_path, debug=False):

    df_data = pd.DataFrame(columns=['label', 'sentence'])
    
    with open(file_path, 'r') as fp:
        line = fp.readline()
        index = 0
        while line:
            sections = line.split('\t')
            label = np.int16(sections[0].strip())
            sentence = str(sections[1].strip())

            if label < 3:
                df_data.loc[index] = [label, sentence]
                index += 1

            line = fp.readline()

    if debug:    
        print(df_data.head())
        print(df_data.describe())
        print(df_data.info())
        labels = np.unique(df_data['label'])
        print(labels)

        label_counter = Counter(df_data['label'])
        print(label_counter)
        
    
    df_data['label'] = df_data['label'].astype(np.uint8)
    return df_data

In [27]:
train_file = INPUT_DIR + 'trn_data'
df_data_train = read_input(train_file, True).copy()

  label                                           sentence
0     0  A cockroach will live nine days without it’s h...
1     0  More people are killed each year from bees tha...
2     0  Well i-, well it seemed to make sense since I ...
3     0                 So, I have none left what so ever.
4     0  You have you have a lot of younger brothers an...
        label sentence
count    1388     1388
unique      3     1382
top         0    yeah.
freq      664        2
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1388 entries, 0 to 1387
Data columns (total 2 columns):
label       1388 non-null object
sentence    1388 non-null object
dtypes: object(2)
memory usage: 32.5+ KB
None
[0 1 2]
Counter({0: 664, 2: 381, 1: 343})


In [28]:
test_file = INPUT_DIR + 'tst_data'
df_data_test = read_input(test_file, True).copy()

  label                                           sentence
0     0  Like if she'll ask me where are my crayons whe...
1     0         Ketchup was sold in the 1830s as medicine.
2     0                                 I need some water.
3     0                                         yeah yeah.
4     0  kind of telling him that, you know, from my re...
        label sentence
count     232      232
unique      3      231
top         2     Why.
freq       85        2
<class 'pandas.core.frame.DataFrame'>
Int64Index: 232 entries, 0 to 231
Data columns (total 2 columns):
label       232 non-null object
sentence    232 non-null object
dtypes: object(2)
memory usage: 5.4+ KB
None
[0 1 2]
Counter({2: 85, 1: 76, 0: 71})


### Label Info: 
    
1. 0 --> 
2. 1 --> 
3. 2 --> 
4. 3 --> NA - Shouldn't be included

### Preprocessing

In [29]:
import re

def preprocess_text(text):
    
    # print(text, end ='')
    
    text = text.lower()
    text = re.sub(r'\W',' ', text)
    text = re.sub(' \d+', ' ', text)
    text = re.sub(r'\s+',' ', text)
        
    words = text.split(' ')
    words = [w.strip() for w in words if w not in stopwords.words('english')]
    
    text = ' '.join(words)
    text = text.strip()
        
    # print(text)
    return text

In [30]:
def preprocess_df(df_data):
    df_data['proc_sentence'] = df_data['sentence'].apply(lambda x: preprocess_text(x))
    df_data.drop(df_data[df_data['proc_sentence'] == ''].index, inplace=True)
    
    return df_data

In [31]:
# Preprocess train & test data frame
print('Train before cleaning', df_data_train.shape)
df_data_train = preprocess_df(df_data_train)
print('Train after cleaning', df_data_train.shape)


print('Test before cleaning', df_data_test.shape)
df_data_test = preprocess_df(df_data_test)
print('Test after cleaning', df_data_test.shape)

Train before cleaning (1388, 2)
Train after cleaning (1358, 3)
Test before cleaning (232, 2)
Test after cleaning (227, 3)


### Create Corpus using only train data

In [32]:
corpus = df_data_train['sentence'].values
print('Corpus Length ', len(corpus))

Corpus Length  1358


### Vectorization

In [33]:
from sklearn.feature_extraction.text import CountVectorizer

## Use Bag of Words Vectorizer for encoding`
vectorizer = CountVectorizer()
vectorizer.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

### Vectorization of Train

In [34]:
data_train = vectorizer.transform(df_data_train['sentence'])
print('Shape of the data train:',data_train.shape)

Shape of the data train: (1358, 2766)


### Train

In [35]:
label_train = np.array(df_data_train['label'])
label_train = label_train.reshape((len(label_train), 1))
label_train

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=uint8)

### Test

In [36]:
data_test = vectorizer.transform(df_data_test['sentence'])
print('Shape of the data train:',data_test.shape)

Shape of the data train: (227, 2766)


In [37]:
label_test = np.array(df_data_test['label'])
label_test = label_test.reshape((len(label_test), 1))

### Estimator

In [38]:
from sklearn.linear_model import LogisticRegression

# Run Logistic Regression
log_regr = LogisticRegression()
log_regr.fit(data_train, label_train)
predictions = log_regr.predict(data_test)

predictions

array([1, 0, 2, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 2, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 1, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 2, 0, 1, 1, 1, 0, 0, 1, 1, 1, 2, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 2, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 2, 1, 2, 1, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 2, 1, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2,
       0, 2, 2, 0, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2,
       2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       2, 2, 1, 2, 0, 2, 2], dtype=uint8)

In [39]:
from sklearn.metrics import f1_score


## Accuracy Measure
print('Train Accuracy', log_regr.score(data_train, label_train))
print('Test Accuracy', log_regr.score(data_test, label_test))

# F1
f1_measure = f1_score(label_test, predictions, average='macro')
print('F1 macro Score: ', f1_measure)

# Classification report
from sklearn.metrics import classification_report
print(classification_report(label_test, predictions))

Train Accuracy 0.9742268041237113
Test Accuracy 0.6784140969162996
F1 macro Score:  0.6710357243800309
              precision    recall  f1-score   support

           0       0.55      0.72      0.63        71
           1       0.71      0.52      0.60        71
           2       0.80      0.78      0.79        85

    accuracy                           0.68       227
   macro avg       0.69      0.67      0.67       227
weighted avg       0.69      0.68      0.68       227

