In [1]:
import pandas as pd
import numpy as np
import os
import time
import gc

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

cwd = os.getcwd()
# Read the inputs through explicit paths instead of temporarily changing the
# process working directory: if a read failed between the two chdir calls the
# kernel stayed inside the data folder, and re-running the cell then resolved
# the relative path from the wrong place.
data_dir = os.path.join(cwd, '..', '..', '..', 'data', 'output csv files')

ratings = pd.read_csv(os.path.join(data_dir, 'finalratings.csv'))
finalbooks = pd.read_csv(os.path.join(data_dir, 'finalbooks.csv'))
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/padmapriya-09/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import collections

def get_words(message):
    """Tokenize a message into lowercase words.

    The message is cut at every single space character and each piece is
    lowercased. No other cleaning is applied, so consecutive spaces produce
    empty-string tokens.

    Args:
        message: A string containing an SMS message

    Returns:
        A list with the lowercased, space-separated pieces of the message.
    """
    return [token.lower() for token in message.split(" ")]

def create_dictionary(messages, min_count=25, stop_words=None):
    """Create a dictionary mapping words to integer indices.

    A word is counted at most once per message (document frequency). It enters
    the vocabulary when it appears in at least ``min_count`` messages, is longer
    than one character and is not a stop word. Indices are assigned
    consecutively from 0 in the order the qualifying words were first seen.

    Args:
        messages: An iterable of strings containing SMS messages
        min_count: Minimum number of messages a word must appear in to be kept.
        stop_words: Iterable of words to exclude. Defaults to NLTK's English
            stop-word list.

    Returns:
        A python dict mapping words to integers.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the exclusion set once. Evaluating stopwords.words('english') inside
    # the filter re-created the list and scanned it linearly for every
    # candidate word.
    excluded = set(stop_words)

    document_frequency = collections.Counter()
    for message in messages:
        # set(): a word repeated inside one message still counts once.
        document_frequency.update(set(get_words(message)))

    resulting_dictionary = {}
    for word, count in document_frequency.items():
        if count >= min_count and len(word) > 1 and word not in excluded:
            resulting_dictionary[word] = len(resulting_dictionary)

    return resulting_dictionary

def transform_text(messages, word_dictionary):
    """Build the message-by-word count matrix for a collection of messages.

    Args:
        messages: A list of strings where each string is an SMS message.
        word_dictionary: A python dict mapping words to integers (column indices).

    Returns:
        A numpy array with one row per message and one column per vocabulary
        word, where entry (i, j) is the number of occurrences of the j-th
        vocabulary word in the i-th message. Words missing from the
        dictionary are ignored.
    """
    n_rows, n_cols = len(messages), len(word_dictionary)
    counts = np.zeros((n_rows, n_cols))

    for row, message in enumerate(messages):
        # Column indices of the in-vocabulary tokens, repeats included.
        columns = (
            word_dictionary[token]
            for token in get_words(message)
            if token in word_dictionary
        )
        for col in columns:
            counts[row, col] += 1

    return counts

def fit_naive_bayes_model(matrix, labels):
    """Fit a multinomial naive Bayes model with add-one (Laplace) smoothing.

    Args:
        matrix: Array-like of shape (n_examples, n_words) containing word counts
            for the training data. Integer and float counts are both accepted.
        labels: Array-like of length n_examples with the binary (0 or 1) labels
            for that training data (list, numpy array or pandas Series).

    Returns:
        The trained model, a dict with:
            'logphi_0', 'logphi_1': log prior of class 0 and class 1.
            'logtheta_0', 'logtheta_1': arrays of length n_words holding the
                log probability of each word under class 0 and class 1.
    """
    # Float matrix: on an integer count matrix the smoothed counts stayed
    # integer and the normalisation below could not be stored back into them.
    matrix = np.asarray(matrix, dtype=float)
    # Array labels: `labels == 0` on a plain Python list is the scalar False,
    # which silently selected no rows instead of building a row mask.
    labels = np.asarray(labels)

    # Without any training row the empirical rate is undefined (0/0); fall back
    # to the same neutral 0.5 that the smoothing below mixes in.
    positive_rate = labels.mean() if labels.size else 0.5
    # Blend 95% empirical rate with 5% of a 0.5 prior so phi never reaches 0 or
    # 1 and both log priors stay finite.
    phi = positive_rate * 0.95 + 0.05 * 0.5

    model = {
        'logphi_0': np.log(1. - phi),
        'logphi_1': np.log(phi),
    }

    # Add-one smoothing: every word gets a pseudo-count of 1 in each class.
    counts_0 = matrix[labels == 0].sum(axis=0) + 1
    counts_1 = matrix[labels == 1].sum(axis=0) + 1
    model['logtheta_0'] = np.log(counts_0 / counts_0.sum())
    model['logtheta_1'] = np.log(counts_1 / counts_1.sum())

    return model

def predict_from_naive_bayes_model(model, matrix):
    """Use a Naive Bayes model to compute predictions for a target matrix.

    Args:
        model: A trained model from fit_naive_bayes_model
        matrix: A numpy array of shape (n_examples, n_words) containing word counts

    Returns: A numpy array of length n_examples containing, for each row, the
        model's posterior probability P(label = 1 | words), a value in [0, 1]
        that grows as class 1 becomes more likely.
    """
    matrix = np.asarray(matrix)

    # Joint log-probability of each row with each class:
    # log p(y) + sum_j count_j * log p(word_j | y).
    log_joint_0 = matrix @ model['logtheta_0'] + model['logphi_0']
    log_joint_1 = matrix @ model['logtheta_1'] + model['logphi_1']

    # The previous score, log_joint_1 / (log_joint_1 + log_joint_0), divided
    # two negative log values and therefore *decreased* as class 1 became more
    # likely. Return the actual posterior instead. logaddexp computes
    # log(exp(a) + exp(b)) without overflow or underflow, so long messages with
    # very negative log-probabilities are handled safely.
    log_evidence = np.logaddexp(log_joint_0, log_joint_1)
    return np.exp(log_joint_1 - log_evidence)

In [3]:
# Books without a snippet fall back to their title.
finalbooks['snippet'] = finalbooks['snippet'].fillna(finalbooks['title'])
# Strip everything that is neither a word character nor whitespace.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
# string by default, which would leave all punctuation in place.
finalbooks['snippet'] = finalbooks['snippet'].str.replace(r'[^\w\s]', "", regex=True)
# Last resort for rows where both snippet and title were missing.
finalbooks['snippet'] = finalbooks['snippet'].fillna(finalbooks['tag_cloud'])
dico = create_dictionary(finalbooks['snippet'])
print("Length of dico: %d"%len(dico))
dico

Length of dico: 3136


{'chosen': 0,
 'fight': 1,
 'lives': 2,
 'known': 3,
 'punishment': 4,
 'whole': 5,
 'close': 6,
 '12': 7,
 'grace': 8,
 'bigger': 9,
 'female': 10,
 'america': 11,
 'make': 12,
 'surrounded': 13,
 'claim': 14,
 'sees': 15,
 'early': 16,
 'trained': 17,
 'take': 18,
 'survival': 19,
 'forced': 20,
 'country': 21,
 'formed': 22,
 'north': 23,
 'losing': 24,
 'sister': 25,
 'means': 26,
 'rebellion': 27,
 'hunger': 28,
 'creation': 29,
 'led': 30,
 'male': 31,
 'ages': 32,
 'young': 33,
 'winning': 34,
 'power': 35,
 'stronger': 36,
 'wealthy': 37,
 'selected': 38,
 'certain': 39,
 'history': 40,
 'nature': 41,
 'boy': 42,
 'famous': 43,
 'place': 44,
 'one': 45,
 'system': 46,
 'leaving': 47,
 'death': 48,
 'event': 49,
 'must': 50,
 'nation': 51,
 'girl': 52,
 'destruction': 53,
 'games': 54,
 '18': 55,
 'survivor': 56,
 'second': 57,
 'potter': 58,
 'stone': 59,
 'harry': 60,
 'vampires': 61,
 'love': 62,
 'story': 63,
 'first': 64,
 'didnt': 65,
 'part': 66,
 'know': 67,
 'internatio

In [4]:
# A is the word-count matrix returned by transform_text: one row per entry of
# finalbooks['snippet'] and one column per word in dico. The recorded run
# produced shape (8000, 3136).
A = transform_text(finalbooks['snippet'], dico)
A.shape

(8000, 3136)

In [5]:
# Distinct user ids found in the ratings file.
useronly = ratings['newuser_id'].unique()
# Drop the reference to the ratings frame; the visible cells below only use
# `useronly`.
del ratings

In [6]:
# Keep only the 'newbookid' column, as a one-column DataFrame with the same row
# order as finalbooks.
a = finalbooks.filter(['newbookid'])
# Drop the reference to the full books frame; the visible cells below only use
# `a`.
del finalbooks

In [7]:
# Number of not-yet-read books recommended to every user.
TOP_K = 10

# Running sum over users of (logtheta_0 - logtheta_1), one entry per vocabulary word.
indicators = np.zeros(len(dico))

# Start both output files with a header-only frame; every user's rows are
# appended to them inside the loop.
b = pd.DataFrame({'newbookid':pd.Series([], dtype='int64'), 'newuser_id':pd.Series([], dtype='int64'), 'predicted_rating':pd.Series([], dtype='float64')})
b.to_csv('all_predictions_of_naive_bayes.csv',index=False)
top_predictions = pd.DataFrame({'newuser_id':pd.Series([], dtype='int'), 'newbookid':pd.Series([], dtype='int')})
top_predictions.to_csv('top_10_preds_for_each_user_by_naive_bayes.csv', index=False)

start = time.time()

# NOTE(review): user ids are assumed to be exactly 1..len(useronly) — confirm.
for i in range(len(useronly)):
    user_id = i + 1

    # Select this user's training rows once; they feed both the model fit and
    # the already-read filter below.
    User = train.loc[train.newuser_id == user_id].sort_values('newbookid')
    # A rating of 4 or more counts as "liked".
    User['binary'] = (User['rating'] >= 4).astype(int)

    # NOTE(review): row (newbookid - 1) of A is assumed to hold that book's
    # snippet counts, i.e. finalbooks is ordered by newbookid — confirm.
    # Plain numpy arrays are passed so the selection is purely positional.
    model = fit_naive_bayes_model(
        A[User['newbookid'].to_numpy() - 1, :],
        User['binary'].to_numpy(),
    )
    result = predict_from_naive_bayes_model(model, A)

    UserRes = a.filter(['newbookid'])
    UserRes['newuser_id'] = user_id
    # Map the [0, 1] score onto the 1-5 rating scale.
    UserRes['predicted_rating'] = result * 4 + 1
    UserRes.to_csv('all_predictions_of_naive_bayes.csv', mode = 'a', header = False, index=False)

    ranked_books = UserRes.sort_values(by=['predicted_rating'], ascending=False)['newbookid'].tolist()
    # A set gives constant-time membership tests.
    read_books = set(User['newbookid'].tolist())

    # Walk the ranking once and stop at TOP_K unread books. The bounded loop
    # also ends cleanly when fewer than TOP_K unread books exist, where an
    # unbounded index walk would run past the end of the list.
    recommendations = []
    for book_id in ranked_books:
        if book_id in read_books:
            continue
        recommendations.append({'newuser_id': user_id, 'newbookid': book_id})
        if len(recommendations) == TOP_K:
            break

    # Build the frame in one go; DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every added row.
    top_predictions = pd.DataFrame(recommendations, columns=['newuser_id', 'newbookid'])
    top_predictions.to_csv('top_10_preds_for_each_user_by_naive_bayes.csv', mode='a', header=False, index=False)

    del UserRes, top_predictions
    indicators = indicators + (model['logtheta_0'] - model['logtheta_1'])
    if (i+1)%1000 == 0: print("done: ", i+1)

end = time.time()
print("Completed in %d seconds"%(end-start))

del train, A, dico
gc.collect()

done:  1000
done:  2000
done:  3000
done:  4000
done:  5000
done:  6000
done:  7000
done:  8000
done:  9000
done:  10000
done:  11000
done:  12000
done:  13000
done:  14000
done:  15000
Completed in 2186 seconds


0