### Making Predictions With Our Model

#### Imports

In [30]:
import pickle
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import punkt
from nltk.corpus.reader import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

#### Import the dataframe for the AVArticles.

In [38]:
# Import the dataframe
df1 = pd.read_pickle('../03. Feature Engineering/Pickles/df.pickle')

# Output a few rows.
df1.head()

Unnamed: 0,FileName,Category,Complete_Filename,Content,Content_Parsed,Category_Code
0,na,tech,na,Markets Tech Media Success Perspectives Videos...,market tech media success perspectives videos ...,4
1,na,tech,na,"Full Episode Tuesday , Sep 7 Close Menu PBS Ne...",full episode tuesday sep 7 close menu pbs new...,4
2,na,tech,na,Accessibility links Skip main content Keyboard...,accessibility link skip main content keyboard ...,4
3,na,tech,na,Skip main content Search Brookings About Us Pr...,skip main content search brook us press room ...,4


#### Trained models

In [33]:
# Set the model file path
file_svcModel = "../04. Model Training/Model/best_svc.pickle"

# Load the model.
with open(file_svcModel, 'rb') as data:
    svc_model = pickle.load(data)



#### TF-IDF object

In [14]:
# Set the file path
path_tfidf = "../03. Feature Engineering/PicklesBBCNews/tfidf.pickle"

# Load the tfidf data.
with open(path_tfidf, 'rb') as data:
    tfidf = pickle.load(data)

#### Category mapping dictionary

In [15]:
# Set the dictionary
category_codes = {
    'business': 0,
    'entertainment': 1,
    'politics': 2,
    'sport': 3,
    'tech': 4
}

#### Feature engineering workflow

In [16]:
# Identify puntuation and stop words.
punctuation_signs = list("?:!.,;")
stop_words = list(stopwords.words('english'))

# This function performs text cleaning to prep the string for nlp
# processes
def create_features_from_text(text):
    
    # Dataframe creation
    lemmatized_text_list = []
    df = pd.DataFrame(columns=['Content'])
    df.loc[0] = text
    
    # Removes \r, \n, whitespaces, and possesive punctuations 
    # and sets to column Content_Parsed_1
    df['Content_Parsed_1'] = df['Content'].str.replace("\r", " ")
    df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("\n", " ")
    df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("    ", " ")
    df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace('"', '')
    
    # Creates column Content_Parsed_1 from lowercase valuse of Content_Parsed_1.
    df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()
    df['Content_Parsed_3'] = df['Content_Parsed_2']
    
    # Removes punctuation signs in column Content_Parsed_3.
    for punct_sign in punctuation_signs:
        df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(punct_sign, '')
        
    # Creates Content_Parsed_4 from possessive's removal Content_Parsed_3.
    df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace("'s", "")
    
    # Performs lemmatization and creates an empty list.
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_list = []
    
    # Sets text to the first row of column Content_Parsed_4 and
    # splits the text to text_words.
    text = df.loc[0]['Content_Parsed_4']
    text_words = text.split(" ")
    
    # Each word in text_words is lemmatized and appended to
    # lemmatized_list list.
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    lemmatized_text = " ".join(lemmatized_list)    
    lemmatized_text_list.append(lemmatized_text)
    df['Content_Parsed_5'] = lemmatized_text_list
    df['Content_Parsed_6'] = df['Content_Parsed_5']
    
    # regex_stopword is removed in each stop_word in stop_words.
    for stop_word in stop_words:
        regex_stopword = r"\b" + stop_word + r"\b"
        df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(regex_stopword, '')
    
    # Dataframe, df, is created from column, Content_Parsed_6.
    df = df['Content_Parsed_6']
    
    # Dataframe column, Content_Parsed_6, is renamed to Content_Parsed.
    df = df.rename(columns={'Content_Parsed_6': 'Content_Parsed'})
    
    # Sets features as an array of tfidf.
    features = tfidf.transform(df).toarray()
    
    return features

Now let's write a function that tells us the category given the category code:

In [17]:
# Function that takes in the category_id number and returns the 
# category name string.
def get_category_name(category_id):
    for category, id_ in category_codes.items():    
        if id_ == category_id:
            return category

Finally, let's write a function that includes the whole process:

In [18]:
# Function that, given a string, uses the model to perform
# a category prediction and outputs the results.
def predict_from_text(text):
    
    # Predict using the input model
    prediction_svc = svc_model.predict(create_features_from_text(text))[0]
    prediction_svc_proba = svc_model.predict_proba(create_features_from_text(text))[0]
    
    # Return result
    category_svc = get_category_name(prediction_svc)
    
    # Output the result message.
    print("The predicted category using the SVM model is %s." %(category_svc) )
    print("The conditional probability is: %a" %(prediction_svc_proba.max()*100))

### Text introduction and prediction

#### *************This is as far as I got. I couldn't get the original dataframes content to a string to move on to the next step.

In [37]:
text=df1.loc[0]['Content']
text

"Markets Tech Media Success Perspectives Videos Edition U.S. International Arabic Español Markets Tech Media Success Perspectives Videos Search Edition U.S. International Arabic Español Markets Premarkets Dow After-Hours Market Movers Fear & Greed World Markets Investing Markets Now Before Bell Leading Indicator Global Energy Challenge Economy Tracking America 's Recovery Energy Money Tech Innovate Gadget Foreseeable Future Mission : Ahead Upstarts Business Evolved Innovative Cities Unhackable Media Reliable Sources Success Boss Files Risk Takers Fresh Money Invest Ahead Work Transformed Cars Homes Wealth Coach Center Piece Perspectives Videos International Switzerland India Davos Reliable Sources Passion Portfolio On : Germany More Accessibility & CC About Us Newsletters US World Politics Business Opinion Health Entertainment Tech Style Travel Sports Videos Audio Coupons Weather More Follow CNN Business Your self-driving car still n't ready . Smarter roads might change By Matt McFarla

In [36]:
predict_from_text(text)

TypeError: rename() got an unexpected keyword argument 'columns'