In [22]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import string

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

In [23]:
train_raw = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')

train_copy = train_raw.copy(deep = True)
test_copy = test_raw.copy(deep = True)

# Text Vectorization Methods

We take a dataset and convert it into a corpus. Then we create a vocabulary of all the unique words in the corpus. Using this vocabulary, we can then create a feature vector of the count of the words.

In [24]:
sentences = ['The quick brown fox', 'The quick brown fox jumps over a lazy dog']

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'the': 7,
 'quick': 6,
 'brown': 0,
 'fox': 2,
 'jumps': 3,
 'over': 5,
 'lazy': 4,
 'dog': 1}

In [25]:
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 0, 0, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

### Raw

In [26]:
count_vectorizer = CountVectorizer()
train_vectors = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

train_vectros.shape, test_vectors.shape

((7613, 6457), (3263, 21637))

### Stopwords

In [27]:
stop_words = nltk.corpus.stopwords.words('english')

count_vectorizer = CountVectorizer(stop_words=stop_words)

train_vectros = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

train_vectros.shape, test_vectors.shape

((7613, 21498), (3263, 21498))

### Min_DF and Max_DF parameter

MIN_DF lets you ignore those terms that appear rarely in a corpus. In other words, if MIN_dfis 2, it means that a word has to occur at least two documents to be considered useful.

MAX_DF on the other hand, ignores terms that have a document frequency strictly higher than the given threshold.These will be words which appear a lot of documents.

In [32]:
count_vectorizer = CountVectorizer(stop_words=stop_words, min_df=2, max_df=0.8)

train_vectros = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

train_vectros.shape, test_vectors.shape

((7613, 6457), (3263, 6457))

### Preprocessing text - REGEX

In [29]:
# Creating a custom preprocessor that lowercases, removes special characters, removes hyperlinks and punctuation

def custom_preprocessor(text):
    
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) # remove special chars
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    return text

count_vectorizer = CountVectorizer(list(train_copy['text']),preprocessor=custom_preprocessor)

train_vectors = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

train_vectros.shape, test_vectors.shape

((7613, 6457), (3263, 16569))

### N-grams

In [47]:
count_vectorizer = CountVectorizer(list(train_copy['text']),preprocessor=custom_preprocessor, ngram_range=(1,2))

train_vectors = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

train_vectros.shape, test_vectors.shape

((7613, 6457), (3263, 77963))

In [48]:
list(count_vectorizer.vocabulary_)[:10]

['our',
 'deeds',
 'are',
 'the',
 'reason',
 'of',
 'this',
 'earthquake',
 'may',
 'allah']

In [49]:
# strict bigrams

count_vectorizer = CountVectorizer(list(train_copy['text']),preprocessor=custom_preprocessor,ngram_range=(2,2))

train_vectors = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

train_vectros.shape, test_vectors.shape

((7613, 6457), (3263, 61394))

In [50]:
list(count_vectorizer.vocabulary_)[:10]

['our deeds',
 'deeds are',
 'are the',
 'the reason',
 'reason of',
 'of this',
 'this earthquake',
 'earthquake may',
 'may allah',
 'allah forgive']

In [51]:
# character level bigrams

count_vectorizer = CountVectorizer(list(train_copy['text']),
                                   preprocessor=custom_preprocessor,
                                   ngram_range=(2,2), 
                                   analyzer = 'char_wb')

train_vectors = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

train_vectros.shape, test_vectors.shape

((7613, 6457), (3263, 838))

In [52]:
list(count_vectorizer.vocabulary_)[:10]

[' o', 'ou', 'ur', 'r ', ' d', 'de', 'ee', 'ed', 'ds', 's ']

# Creating a Baseline Model with CountVectorizer

In [59]:
count_vectorizer = CountVectorizer(stop_words=stop_words, preprocessor = custom_preprocessor,
                                  ngram_range=(1,2))

train_vectors = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

X_train = train_vectors
y_train = train_copy['target']

### Naive Bayes Classifier

In [60]:
nb_classifier = MultinomialNB()
nb_scores = model_selection.cross_val_score(nb_classifier, 
                                         X_train,
                                         y_train,
                                         cv = 5, 
                                         scoring = 'f1')
print(nb_scores)
print(nb_scores.sum()/5)

[0.64890282 0.60940695 0.68431772 0.65236686 0.71437782]
0.6618744355777822


In [63]:
nb_classifier.fit(train_vectors, y_train)

### Logistic Regression

In [68]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0)
lr_scores = model_selection.cross_val_score(clf, X_train, y_train, cv=5, scoring="f1")

print(lr_scores)
print(lr_scores.sum()/5)

[0.59632139 0.5311943  0.62033898 0.52423343 0.70342523]
0.595102666987082


In [69]:
clf.fit(train_vectors, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Sample Submission

### Naive Bayes

In [65]:
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission["target"] = nb_classifier.predict(test_vectors)
sample_submission.to_csv("FeatureVector_UnscaledNB.csv", index=False)

#RESULT - 0.79141 Accuracy instead of 0.78629 from the previous model. Definitely above the 63% bracket.

### Logistic Reg Classifier

In [70]:
# Submission
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission["target"] = clf.predict(test_vectors)
sample_submission.to_csv("FeatureVector_UnscaledLR.csv", index=False)

# TF-IDF Vectorizer

In [71]:
# word level
tfidf = TfidfVectorizer(analyzer='word',token_pattern=r'\w{1,}',max_features=5000)

train_tfidf = tfidf.fit_transform(train_copy['text'])
test_tfidf = tfidf.transform(test_copy["text"])

In [72]:
#ngram level
tfidf = TfidfVectorizer(analyzer='word',ngram_range=(2,3),token_pattern=r'\w{1,}',max_features=5000)

train_tfidf = tfidf.fit_transform(train_copy['text'])
test_tfidf = tfidf.transform(test_copy["text"])

In [73]:
# characters level
tfidf = TfidfVectorizer(analyzer='char',ngram_range=(2,3),token_pattern=r'\w{1,}',max_features=5000)

train_tfidf = tfidf.fit_transform(train_copy['text'])
test_tfidf = tfidf.transform(test_copy["text"])

# Baseline using TF-IDF

In [75]:
train_tfidf = tfidf.fit_transform(train_copy['text'])
test_tfidf = tfidf.transform(test_copy["text"])

### Logistic Regression

In [77]:
clf = LogisticRegression(C=1.0)
lr_scores = model_selection.cross_val_score(clf, train_tfidf, train_copy["target"], cv=5, scoring="f1")

print(lr_scores)
print(lr_scores.sum()/5)

[0.64338235 0.61563255 0.63263598 0.61979167 0.74009509]
0.6503075288852589


In [78]:
clf.fit(train_tfidf, train_copy["target"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [79]:
# Submission
sample_submission = pd.read_csv("sample_submission.csv")
sample_submission["target"] = clf.predict(test_tfidf)
sample_submission.to_csv("FeatureVector_ScaledLR.csv", index=False)