# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [2]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

X_train.head()

Unnamed: 0,clean_text
0,"['told', 'number', 'gautham']"
1,"['lol', 'yeah', 'point', 'guess']"
2,"['haha', 'okay', 'today', 'weekend', 'leh', '']"
3,"['said', 'look', 'pretty', 'wif', 'long', 'hai..."
4,"['ltgt', 'g', 'saw', 'days', 'ago', 'guy', 'wa..."


### Create TF-IDF Vectors

In [3]:
# Instantiate and fit a TFIDF vectorizer and then use that trained vectorizer
# to transform the messages in the training and test sets
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train['clean_text'])
X_train_vect = tfidf_vect.transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [4]:
# What words did the vectorizer learn?
tfidf_vect.vocabulary_

{'told': 7435,
 'number': 5239,
 'gautham': 3276,
 'lol': 4502,
 'yeah': 8238,
 'point': 5696,
 'guess': 3479,
 'haha': 3506,
 'okay': 5309,
 'today': 7421,
 'weekend': 7948,
 'leh': 4381,
 'said': 6300,
 'look': 4511,
 'pretty': 5815,
 'wif': 8020,
 'long': 4507,
 'hair': 3510,
 'wat': 7902,
 'thk': 7332,
 'hes': 3640,
 'cutting': 2272,
 'quite': 5954,
 'short': 6541,
 'ltgt': 4575,
 'saw': 6343,
 'days': 2337,
 'ago': 959,
 'guy': 3491,
 'wants': 7886,
 'sell': 6423,
 'wifi': 8024,
 '3g': 508,
 'thats': 7290,
 'blanked': 1480,
 'finished': 3044,
 'class': 1960,
 'garbage': 3267,
 'bags': 1298,
 'eggs': 2726,
 'jam': 4046,
 'bread': 1591,
 'hannaford': 3537,
 'wheat': 7991,
 'chex': 1895,
 'ask': 1191,
 'iouri': 3983,
 'ive': 4033,
 'story': 6967,
 'like': 4424,
 'ten': 7239,
 'times': 7382,
 'already': 1024,
 'left': 4375,
 'dessert': 2439,
 'wan': 7878,
 'go': 3357,
 'suntec': 7074,
 'oh': 5301,
 'icic': 3833,
 'lor': 4522,
 'den': 2410,
 'meet': 4754,
 'day': 2332,
 'guaranteed': 3

In [5]:
# How are these vectors stored?
# Sparse Vector/Matrix
X_test_vect[0]

<1x8341 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [6]:
# Can we convert the vectors to arrays?
X_test_vect[0].toarray() #.toarray method

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [7]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())#values.ravel converts columns to array that sk-learn likes

In [8]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect)

In [9]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 1.0 / Recall: 0.797 / Accuracy: 0.974
