# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [11]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['sign', 'maturity', 'start', 'saying', 'big',..."
1,"['mm', 'entirely', 'sure', 'understood', 'text..."
2,"['cant', 'keep', 'talking', 'people', 'sure', ..."
3,"['sorry', 'roommates', 'took', 'forever', 'ok'..."
4,"['escape', 'theatre', 'going', 'watch', 'kaval..."


### Create TF-IDF Vectors

In [12]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [13]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_

{'sign': 6531,
 'maturity': 4682,
 'start': 6841,
 'saying': 6299,
 'big': 1411,
 'things': 7250,
 'actually': 874,
 'understanding': 7565,
 'small': 6619,
 'nice': 5081,
 'evening': 2824,
 'bslvyl': 1617,
 'mm': 4842,
 'entirely': 2782,
 'sure': 7026,
 'understood': 7566,
 'text': 7188,
 'hey': 3621,
 'ho': 3668,
 'weekend': 7871,
 'cant': 1739,
 'keep': 4155,
 'talking': 7098,
 'people': 5492,
 'pay': 5467,
 'agree': 939,
 'price': 5768,
 'pls': 5612,
 'tell': 7155,
 'want': 7807,
 'really': 5974,
 'buy': 1661,
 'much': 4948,
 'willing': 7959,
 'sorry': 6711,
 'roommates': 6196,
 'took': 7389,
 'forever': 3116,
 'ok': 5267,
 'come': 2018,
 'escape': 2802,
 'theatre': 7223,
 'going': 3357,
 'watch': 7828,
 'kavalan': 4150,
 'minutes': 4809,
 'didnt': 2454,
 'get': 3297,
 'anything': 1070,
 'da': 2268,
 '88066': 765,
 'lost': 4498,
 '12': 287,
 'help': 3595,
 'let': 4368,
 'know': 4225,
 'youve': 8220,
 'got': 3391,
 'money': 4881,
 'carlos': 1766,
 'make': 4619,
 'call': 1691,
 'yeah'

In [14]:
# How are these vectors stored?
X_test_vect[0]

<1x8263 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [15]:
# Can we convert the vectors to arrays?
X_test_vect[0].toarray()
print(X_train_vect.shape)
print(len(y_train))
print(y_train)

(4457, 8263)
4457
      label
0         0
1         0
2         0
3         0
4         0
...     ...
4452      0
4453      1
4454      0
4455      0
4456      0

[4457 rows x 1 columns]


### Fit RandomForestClassifier On Top Of Vectors

In [17]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=10)
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [18]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect)
print(y_pred)

[0 0 0 ... 0 0 1]


In [19]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 0.967 / Recall: 0.783 / Accuracy: 0.967
