In [1]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import string

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

In [10]:
train_raw = pd.read_csv('train.csv')
test_raw = pd.read_csv('test.csv')

train_copy = train_raw.copy(deep = True)
test_copy = test_raw.copy(deep = True)

# Text Vectorization Methods

We take a dataset and convert it into a corpus. Then we create a vocabulary of all the unique words in the corpus. Using this vocabulary, we can then create a feature vector of the count of the words.

In [3]:
sentences = ['The quick brown fox', 'The quick brown fox jumps over a lazy dog']

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'the': 7,
 'quick': 6,
 'brown': 0,
 'fox': 2,
 'jumps': 3,
 'over': 5,
 'lazy': 4,
 'dog': 1}

In [6]:
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 0, 0, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64)

### Raw

In [None]:
count_

### Stopwords

In [13]:
stop_words = nltk.corpus.stopwords.words('english')

count_vectorizer = CountVectorizer(stop_words=stop_words)

train_vectros = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

train_vectros.shape, test_vectors.shape

((7613, 21498), (3263, 21498))

### Min_DF and Max_DF parameter

MIN_DF lets you ignore those terms that appear rarely in a corpus. In other words, if MIN_dfis 2, it means that a word has to occur at least two documents to be considered useful.

MAX_DF on the other hand, ignores terms that have a document frequency strictly higher than the given threshold.These will be words which appear a lot of documents.

In [14]:
count_vectorizer = CountVectorizer(stop_words=stop_words, min_df=2, max_df=0.8)

train_vectros = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

train_vectros.shape, test_vectors.shape

((7613, 6457), (3263, 6457))

### Preprocessing text - REGEX

In [20]:
# Creating a custom preprocessor that lowercases, removes special characters, removes hyperlinks and punctuation

def custom_preprocessor(text):
    
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) # remove special chars
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    return text

count_vectorizer = CountVectorizer(list(train_copy['text']),preprocessor=custom_preprocessor)

train_vectors = count_vectorizer.fit_transform(train_copy['text'])
test_vectors = count_vectorizer.transform(test_copy['text'])

train_vectros.shape, test_vectors.shape

((7613, 6457), (3263, 16569))