In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from keras.models import Sequential
from keras.layers import Dense

# Step 1
### Importing the data

In [2]:
# Read the tab-separated ratings file; it has no header row, so the column
# names are supplied here.
ratingColumns = ["userId", "itemId", "rating", "timestamp"]
data0 = pd.read_table("./archive/ml-100k/u.data", sep="\t", names=ratingColumns)
data0.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# Read the pipe-separated user file; it has no header row, so the column
# names are supplied here.
userColumns = ["userId", "age", "gender", "occupation", "zip"]
users0 = pd.read_table("./archive/ml-100k/u.user", sep="|", names=userColumns)
users0.head()

Unnamed: 0,userId,age,gender,occupation,zip
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [4]:
# Read the pipe-separated movie file: id, title, two release dates, a link
# column, then one 0/1 flag column per genre.
# NOTE(review): if this read fails with UnicodeDecodeError, the file is probably
# not UTF-8 and needs an explicit `encoding=` - confirm against the local copy.
movieInfoColumns = ["movieId", "title", "movieRelease", "videoRelease", "imbd"]
genreColumns = [
    "unknown", "action", "adventure", "animation", "children", "comedy",
    "crime", "documentary", "drama", "fantasy", "noir", "horror", "musical",
    "mystery", "romance", "scifi", "thriller", "war", "western",
]
movies0 = pd.read_table("./archive/ml-100k/u.item", sep="|", names=movieInfoColumns + genreColumns)
movies0.head()

Unnamed: 0,movieId,title,movieRelease,videoRelease,imbd,unknown,action,adventure,animation,children,...,fantasy,noir,horror,musical,mystery,romance,scifi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


# Step 2
### Cleaning the data

In [5]:
def convertDates(series):
    """Encode ``DD-Mon-YYYY`` date strings as ``YYYYMMDD`` integers.

    Parameters
    ----------
    series : iterable
        Values such as ``"01-Jan-1995"``. Any value that is not a string
        (float ``NaN``, ``None``, NumPy scalars, ...) is treated as a
        missing date.

    Returns
    -------
    list of int
        One entry per input value: ``year * 10000 + month * 100 + day`` for
        date strings and ``-1`` for missing values.

    Raises
    ------
    KeyError, ValueError, IndexError
        If a string value does not follow the ``DD-Mon-YYYY`` layout
        (unknown month abbreviation, non-numeric day/year, too few parts).
    """
    monthDict = {"Jan" : 1, "Feb" : 2, "Mar" : 3, "Apr" : 4, "May" : 5, "Jun" : 6, "Jul" : 7, "Aug" : 8, "Sep" : 9, "Oct" : 10, "Nov" : 11, "Dec" : 12}
    dates = []
    for val in series:
        # Only strings can be dates. Testing for `str` (rather than "is not a
        # plain float") also covers None and NumPy float scalars, which
        # would otherwise reach `.split` and raise AttributeError.
        if not isinstance(val, str):
            dates.append(-1)
            continue
        parts = val.split("-")
        dates.append(int(parts[0]) + 100 * monthDict[parts[1]] + 10000 * int(parts[2]))
    return dates

def convertZips(series):
    """Convert ZIP codes to integers, using ``-1`` for values that are not integers.

    Parameters
    ----------
    series : iterable
        Raw ZIP values, typically strings such as ``"85711"``.

    Returns
    -------
    list of int
        ``int(value)`` for each input, or ``-1`` where the conversion fails
        (alphanumeric codes, ``None``, ``NaN``, infinities). Leading zeros are
        not preserved: ``"02139"`` becomes ``2139``.
    """
    zips = []
    for val in series:
        try:
            zips.append(int(val))
        # Catch only the errors int() raises for bad input, so that
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        except (TypeError, ValueError, OverflowError):
            zips.append(-1)
    return zips

In [6]:
# clean data: drop the timestamp and collapse the rating into a binary label
# (ratings 3, 4, 5 -> 1; ratings 1, 2 -> 0)
data = data0.drop("timestamp", axis=1)
# Assign the result back instead of calling replace(..., inplace=True) on
# `data.rating`: an in-place call through a column accessor is chained
# assignment and leaves `data` unchanged when pandas Copy-on-Write is active.
data["rating"] = data["rating"].replace({3: 1, 4: 1, 5: 1, 1: 0, 2: 0})
data.head()

Unnamed: 0,userId,itemId,rating
0,196,242,1
1,186,302,1
2,22,377,0
3,244,51,0
4,166,346,0


In [7]:
# Work on a copy so the raw users0 frame stays untouched.
users = users0.copy()
# zip: integer ZIP code, or -1 where the raw value is not an integer
users["zip"] = convertZips(users["zip"])
# gender: "M" -> -1, "F" -> 1
# The result is assigned back rather than replaced in place through the column
# accessor: chained `inplace=True` leaves `users` unchanged when pandas
# Copy-on-Write is active.
users["gender"] = users["gender"].replace({"M": -1, "F": 1})
# occupation: one integer code per distinct occupation, numbered in order of
# first appearance
occupationCodes = {name: code for code, name in enumerate(users["occupation"].unique())}
users["occupation"] = users["occupation"].replace(occupationCodes)

users.head()

Unnamed: 0,userId,age,gender,occupation,zip
0,1,24,-1,0,85711
1,2,53,1,1,94043
2,3,23,-1,2,32067
3,4,24,-1,0,43537
4,5,33,1,1,15213


In [8]:
# Discard the video release date, link and title columns; what remains is the
# movie id, the release date and the genre flags.
movies = movies0.drop(columns=["videoRelease", "imbd", "title"])
# Release dates become YYYYMMDD integers, with -1 marking a missing date.
movies["movieRelease"] = convertDates(movies["movieRelease"])

movies.head()

Unnamed: 0,movieId,movieRelease,unknown,action,adventure,animation,children,comedy,crime,documentary,...,fantasy,noir,horror,musical,mystery,romance,scifi,thriller,war,western
0,1,19950101,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,19950101,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,19950101,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,19950101,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,19950101,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
# Feature blocks without their leading id column. Row k is looked up as the
# user / movie whose id is k + 1.
xUsers = users.to_numpy()[:, 1:]
xMovies = movies.to_numpy()[:, 1:]

rows = len(data)
cols = np.shape(xUsers)[1] + np.shape(xMovies)[1] + 1

userIds = data.userId.values
movieIds = data.itemId.values
ratings = data.rating.values

# The positional lookup (id - 1 -> row) is only valid when the ids are exactly
# 1..N in row order and every rating refers to an id >= 1 (an id of 0 would
# wrap around to the last row). Fail loudly rather than silently joining the
# wrong user or movie to a rating.
assert users["userId"].tolist() == list(range(1, len(users) + 1)), "userId must be 1..N in row order"
assert movies["movieId"].tolist() == list(range(1, len(movies) + 1)), "movieId must be 1..N in row order"
assert (userIds >= 1).all() and (movieIds >= 1).all(), "rating rows must reference ids >= 1"

# One row per rating: [user features | movie features | rating].
# Integer-array indexing gathers all rows at once instead of looping over
# every rating in Python.
nUserFeatures = np.shape(xUsers)[1]
X = np.zeros((rows, cols))
X[:, :nUserFeatures] = xUsers[userIds - 1]
X[:, nUserFeatures:-1] = xMovies[movieIds - 1]
X[:, -1] = ratings

featureNames = [
    "age", "gender", "occupation", "zip", "movieRelease",
    "unknown", "action", "adventure", "animation", "children", "comedy",
    "crime", "documentary", "drama", "fantasy", "noir", "horror", "musical",
    "mystery", "romance", "scifi", "thriller", "war", "western",
]
DATA = pd.DataFrame(X, columns=featureNames + ["rating"])
# Drop rows whose ZIP code or release date carries the -1 "missing" sentinel
# written by convertZips / convertDates.
DATA = DATA[(DATA["zip"] != -1) & (DATA["movieRelease"] != -1)]

X = DATA.to_numpy()
DATA.head()

Unnamed: 0,age,gender,occupation,zip,movieRelease,unknown,action,adventure,animation,children,...,noir,horror,musical,mystery,romance,scifi,thriller,war,western,rating
0,49.0,-1.0,2.0,55105.0,19970124.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,39.0,1.0,3.0,0.0,19970101.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,25.0,-1.0,2.0,40206.0,19940101.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,28.0,-1.0,0.0,80525.0,19940101.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
4,47.0,-1.0,7.0,55113.0,19970101.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Step 3
### Splitting the Data

In [10]:
# The last column of X is the binary rating label; every other column is a feature.
points = X[:, :-1]
targets = X[:, -1:]

# 80/20 train/test split. The seed is fixed so the same rows land in each
# split on every run; the original call reshuffled on each execution.
xTrain, xTest, yTrain, yTest = train_test_split(points, targets, test_size=0.2, random_state=42)

# Step 4
### Training the Model

In [11]:
# Fully connected binary classifier: 24 inputs -> 30 -> 50 -> 50 ReLU units
# -> one sigmoid output.
model = Sequential([
    Dense(30, input_dim=24, activation="relu"),
    Dense(50, activation="relu"),
    Dense(50, activation="relu"),
    Dense(1, activation="sigmoid"),
])

In [12]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported as a metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [13]:
# Train on the training split: 30 epochs, 10 rows per batch.
model.fit(
    xTrain,
    yTrain,
    batch_size=10,
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fc3c3529a90>

In [14]:
# Persist the trained model under the relative path "myModel"; the recorded
# run wrote it out as a directory (myModel/assets).
# NOTE(review): some Keras versions may require an explicit file extension on
# this path - confirm against the installed version.
model.save("myModel")

INFO:tensorflow:Assets written to: myModel/assets


In [21]:
# Score the held-out split: the sigmoid outputs are thresholded at 0.5 to get
# hard 0/1 predictions, which are then compared with the true labels.
yPredict = model.predict(xTest)
yLabels = yPredict > 0.5
accuracy_score(yTest, yLabels)

0.8201828302946734

In [11]:
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense

In [13]:
# Read the comma-separated trial data into an array.
dataset = loadtxt("trialData.csv", delimiter=",")

# The first eight columns are the inputs; the ninth column is the target.
X, Y = dataset[:, :8], dataset[:, 8]

In [14]:
# Small fully connected binary classifier: 8 inputs -> 12 -> 8 ReLU units
# -> one sigmoid output.
model = Sequential([
    Dense(12, input_dim=8, activation="relu"),
    Dense(8, activation="relu"),
    Dense(1, activation="sigmoid"),
])

In [15]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported as a metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [16]:
# Train on the full trial data set: 150 epochs, 10 rows per batch.
model.fit(
    X,
    Y,
    batch_size=10,
    epochs=150,
)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 83/150
Epoch 84/150
Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<tensorflow.python.keras.callbacks.History at 0x7f2313722b50>

In [17]:
# evaluate() yields the loss followed by the accuracy metric set in compile().
# X and Y are the same arrays the model was fitted on, so this figure is
# accuracy on the training data, not on held-out data.
# The loss gets a real name because assigning to `__` would overwrite
# IPython's output-history shortcut of the same name.
_loss, accuracy = model.evaluate(X, Y)
print(f"Accuracy: {accuracy * 100:.2f}")

Accuracy: 75.78
