# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['', 'picking', 'various', 'points', 'going', ..."
1,"['dude', 'makin', 'weirdy', 'brownies', 'siste..."
2,"['hen', 'night', 'going', 'swing']"
3,"['hi', '07734396839', 'ibh', 'customer', 'loya..."
4,"['love', 'aathilove', 'u', 'lot']"


### Create TF-IDF Vectors

In [2]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [3]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_

{'picking': 5541,
 'various': 7704,
 'points': 5632,
 'going': 3339,
 'yeovil': 8193,
 'motor': 4892,
 'project': 5801,
 'hours': 3716,
 'take': 7111,
 'home': 3670,
 '12': 287,
 '530': 614,
 'max': 4668,
 'easy': 2672,
 'dude': 2638,
 'makin': 4600,
 'weirdy': 7903,
 'brownies': 1634,
 'sister': 6568,
 'made': 4578,
 'awesome': 1272,
 'cookies': 2136,
 'took': 7408,
 'pics': 5543,
 'hen': 3589,
 'night': 5076,
 'swing': 7080,
 'hi': 3613,
 '07734396839': 26,
 'ibh': 3793,
 'customer': 2259,
 'loyalty': 4511,
 'offer': 5227,
 'new': 5053,
 'nokia6600': 5118,
 'mobile': 4833,
 '10': 254,
 'txtauctiontxt': 7542,
 'wordstart': 8045,
 'no81151': 5101,
 'get': 3281,
 'now4t': 5161,
 'love': 4492,
 'aathilove': 829,
 'lot': 4478,
 'well': 7907,
 'keep': 4140,
 'mind': 4775,
 'ive': 3995,
 'got': 3374,
 'enough': 2765,
 'gas': 3242,
 'one': 5272,
 'round': 6192,
 'trip': 7478,
 'barring': 1335,
 'sudden': 6987,
 'influx': 3886,
 'cash': 1801,
 'mystery': 4970,
 'solved': 6687,
 'opened': 5294

In [4]:
# How are these vectors stored?
X_test_vect[0]

<1x8283 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [5]:
# Can we convert the vectors to arrays?
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [9]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [10]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect)

In [11]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 1.0 / Recall: 0.827 / Accuracy: 0.978
