In [None]:
import pickle

import pandas as pd

try:
    # joblib is a standalone package; the copy bundled as sklearn.externals.joblib
    # was deprecated in scikit-learn 0.21 and removed in 0.23.
    import joblib
except ImportError:
    # Fall back for old environments that only ship the bundled copy.
    from sklearn.externals import joblib

### Prerequisites

In order to use the notebook you **must** set the value for the variables in the cell below.  Simply set each variable to a string representing the corresponding filename.  For example:

**BEFORE**
```python
vectorizer_file = # Replace this comment with Vectorizer Filename
labeled_data_file = # Replace this comment with Labeled Data Csv Filename
tfidf_matrix_file = # Replace this comment with Tfidf Matrix Pkl Filename
model_training_file = # Replace this comment with Model Training Pkl Filename
label_file = # Replace this comment with Label Csv Filename
```

**AFTER**
```python
vectorizer_file = "project_2_vectorizer.pkl"
labeled_data_file = "project_2_labeled_data.csv"
tfidf_matrix_file = "project_2_tfidf_matrix.pkl"
model_training_file = "project_2_training_2.pkl"
label_file = "project_2_labels.csv"
```

In [None]:
# Configuration: set each variable below to a quoted filename string.
# NOTE: as written, every assignment has only a comment on its right-hand side,
# so this cell raises a SyntaxError until each placeholder comment is replaced
# with a filename (e.g. "project_2_vectorizer.pkl").
vectorizer_file = # Replace this comment with Vectorizer Filename
labeled_data_file = # Replace this comment with Labeled Data Csv Filename
tfidf_matrix_file = # Replace this comment with Tfidf Matrix Pkl Filename
model_training_file = # Replace this comment with Model Training Pkl Filename
label_file = # Replace this comment with Label Csv Filename

### 4.A. Preprocessing new data for the model to predict

The following sample code can be used to preprocess new data and get it in a format that is usable by the saved model.


**Note:** to see which words the columns of the transformed TFIDF matrix are based on, call ```print(tfidf_transformer.vocabulary_)```. This prints a dictionary mapping each word that was used to its column index in the transformed tfidf_matrix.

In [None]:
# Restore the pickled vectorizer from disk so new text can be transformed with it.
# Unpickling executes code embedded in the file, so only load files you trust.
with open(vectorizer_file, "rb") as vectorizer_handle:
    tfidf_transformer = pickle.load(vectorizer_handle)

In [None]:
# Example documents used to demonstrate preprocessing; each list element is one
# document given as a plain string. Replace these with the text you want to predict.
new_data = [
    "this is new data",
    "that needs to be formatted",
    "In the same way that the other data was before",
]

In [None]:
# Apply the loaded vectorizer to the new documents. Only transform() is called,
# so the vectorizer is used exactly as it was loaded and is not refit here.
transformed_data = tfidf_transformer.transform(new_data)

### 4.B. Applying the model to predict new data

The following sample code can be used to read in the zipped files and get predictions for the unlabeled data.

**NOTE:** the model will predict labels as a number. The project\_\#_labels.csv gives the mapping between label text and ID.

In [None]:
# Load the two inputs for prediction:
#   labeled_frame - the labeled-data CSV as a DataFrame
#   tfidf_dict    - the unpickled TFIDF matrix object (indexed by key in the next cell)
labeled_frame = pd.read_csv(labeled_data_file)
with open(tfidf_matrix_file, "rb") as matrix_handle:
    tfidf_dict = pickle.load(matrix_handle)

In [None]:
# Subset the TFIDF matrix to the documents that have not been labeled yet.
labeled_ids = labeled_frame["ID"].tolist()
# A set gives constant-time membership checks; testing against the list would
# rescan every labeled ID for each key in tfidf_dict.
labeled_id_lookup = set(labeled_ids)

# Keep the IDs next to the vectors so predictions[i] can be traced back to
# unlabeled_ids[i]; both lists follow tfidf_dict's iteration order.
unlabeled_ids = [key for key in tfidf_dict if key not in labeled_id_lookup]
unlabeled = [tfidf_dict[key] for key in unlabeled_ids]

# read in the model from the pickle file
model = joblib.load(model_training_file)

# Apply the model to the unlabeled data. When every document is already labeled
# there is nothing to predict, and scikit-learn estimators typically raise on an
# empty sample list, so fall back to an empty result instead of calling predict().
predictions = model.predict(unlabeled) if unlabeled else []

# print the results (label IDs, not label text)
print(predictions)

In [None]:
# Open the label file and build a Label_ID -> Name lookup to translate label IDs
# to their corresponding text. Selecting the "Name" column before to_dict() avoids
# building a dictionary for every other column only to discard it.
labels_frame = pd.read_csv(label_file)
label_dict = labels_frame.set_index("Label_ID")["Name"].to_dict()

# Translate into a new list instead of overwriting `predictions`: if the numeric
# predictions were replaced by names, re-running this cell would look names up
# in an ID-keyed dictionary and fail with a KeyError.
predicted_labels = [label_dict[pred] for pred in predictions]
print(predicted_labels)