## Importing 

In [1]:
import numpy as np 
import pandas as pd 
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
# import os

# For printing the path of dataset in Kaggle
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))


## Reading the data set

In [2]:
# stories = pd.read_csv('/kaggle/input/stories-combined/stories_combined.csv')
stories = pd.read_csv('files/stories_combined.csv')
stories.drop('Unnamed: 0', axis=1, inplace=True)

In [3]:
stories.head()

Unnamed: 0,bookno,content,Title,Author,Language,Category
0,51082.txt,*** START OF THIS PROJECT GUTENBERG EBOOK COMI...,Coming Attraction,Fritz Leiber,English,Science Fiction
1,32243.txt,*** START OF THIS PROJECT GUTENBERG EBOOK CONF...,Confidence Game,James McKimmey,English,Science Fiction
2,306-0.txt,*** START OF THIS PROJECT GUTENBERG EBOOK EARL...,"The Early Short Fiction of Edith Wharton, Par...",Edith Wharton,English,Fiction
3,31038.txt,*** START OF THIS PROJECT GUTENBERG EBOOK THE ...,The Real Hard Sell,William W Stuart,English,Science Fiction
4,28636-8.txt,*** START OF THIS PROJECT GUTENBERG EBOOK THE ...,The Grey Woman and other Tales,Mrs. (Elizabeth) Gaskell,English,Fiction


In [4]:
stories['Category'].value_counts()

Science Fiction        54
Fiction                25
Classics               18
Literature             10
Historical Fiction      4
Literary fiction        2
Horror                  2
Sociology Fiction       2
Classical Fiction       2
Childrens Fiction       1
Romance Literature      1
Love Fiction            1
Fantasy Fiction         1
Political Fiction       1
Villages Fiction        1
Fairy Tales             1
Detective Fiction       1
Science fiction         1
Fantasy                 1
Name: Category, dtype: int64

In [5]:
stories['content'] = stories['content'].apply(lambda x: x.lower())

## Cleaning the data.

### Removing the stopwords and punctuations.

In [6]:
my_stop_words = stopwords.words('english')
my_stop_words.extend(["gutenberg", "ebook", "online", "distributed", "transcriber", "etext", "note", "copyright", "start",
            "project", "end", "produced", "proofreading", "team", "http", "www", "pgdp", "net", "illustrated", ])
def text_cleaning(a):
    remove_punctuation = [char for char in a if char not in string.punctuation]
    remove_punctuation = ''.join(remove_punctuation)
    return [word for word in remove_punctuation.split() if word.lower() not in my_stop_words]
    
stories['content'] = stories['content'].apply(text_cleaning)
stories['content'] = stories['content'].apply(lambda x:" ".join(x))

In [7]:
stories

Unnamed: 0,bookno,content,Title,Author,Language,Category
0,51082.txt,coming attraction greg weeks mary meehan httpw...,Coming Attraction,Fritz Leiber,English,Science Fiction
1,32243.txt,confidence game greg weeks david wilson httpww...,Confidence Game,James McKimmey,English,Science Fiction
2,306-0.txt,early short fiction john hamm early short fict...,"The Early Short Fiction of Edith Wharton, Par...",Edith Wharton,English,Fiction
3,31038.txt,real hard sell robert cicconetti david wilson ...,The Real Hard Sell,William W Stuart,English,Science Fiction
4,28636-8.txt,grey woman tales delphine lettau canada httpww...,The Grey Woman and other Tales,Mrs. (Elizabeth) Gaskell,English,Fiction
...,...,...,...,...,...,...
124,7114-8.txt,une vie piece string thomas berger eric eldred...,"Une Vie, A Piece of String and Other Stories",Guy de Maupassant,English,Fiction
125,3077-0.txt,maupassant short stories david widger original...,"Original Short Stories of Maupassant, Volume 1",Guy de Maupassant,English,Fiction
126,1096.txt,faith men jack london 2734 series jack london ...,The Faith of Men,Jack London,English,Fiction
127,59157.txt,escape mechanism greg weeks mary meehan httpww...,Escape Mechanism,Charles E. Fritch,English,Science Fiction


In [8]:
X = stories.iloc[:,1:2]
y = stories['Category']

In [9]:
X

Unnamed: 0,content
0,coming attraction greg weeks mary meehan httpw...
1,confidence game greg weeks david wilson httpww...
2,early short fiction john hamm early short fict...
3,real hard sell robert cicconetti david wilson ...
4,grey woman tales delphine lettau canada httpww...
...,...
124,une vie piece string thomas berger eric eldred...
125,maupassant short stories david widger original...
126,faith men jack london 2734 series jack london ...
127,escape mechanism greg weeks mary meehan httpww...


In [10]:
y


0      Science Fiction
1      Science Fiction
2              Fiction
3      Science Fiction
4              Fiction
            ...       
124            Fiction
125            Fiction
126            Fiction
127    Science Fiction
128            Fiction
Name: Category, Length: 129, dtype: object

### Converting the categories into numeric format.

In [11]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

y = encoder.fit_transform(y)

In [12]:
y

array([15, 15,  7, 15,  7,  8, 15, 16,  2, 15, 15,  8, 15,  2, 15, 11, 15,
        3, 15,  6,  0,  1, 14,  7, 15,  2, 15,  2, 11, 15, 15, 15, 11, 15,
       15, 15,  2,  2, 15, 12, 17, 11, 13, 15, 15, 15, 17, 15, 18,  7,  4,
       15, 15,  7, 15,  2, 15,  7,  2,  9, 11, 15,  7, 10,  5,  2,  7,  2,
       15,  7, 15, 15, 15,  7,  7, 15,  7,  7, 11, 11,  7, 15, 15, 15, 15,
        8, 15,  2,  7, 15, 15,  8,  2, 15,  2,  7,  2,  2, 15,  7, 11,  7,
       15,  1, 15, 15,  2, 10,  7, 15, 15,  2,  9, 15,  2, 15, 15, 15, 15,
        7,  7, 11, 11, 15,  7,  7,  7, 15,  7])

In [13]:
j = 0
print('Number','Category', sep=' \t  ')
for i in y:
    print(i, stories['Category'].iloc[j], sep=' \t- ')
    j += 1

Number 	  Category
15 	- Science Fiction
15 	- Science Fiction
7 	- Fiction
15 	- Science Fiction
7 	- Fiction
8 	- Historical Fiction
15 	- Science Fiction
16 	- Science fiction
2 	- Classics
15 	- Science Fiction
15 	- Science Fiction
8 	- Historical Fiction
15 	- Science Fiction
2 	- Classics
15 	- Science Fiction
11 	- Literature
15 	- Science Fiction
3 	- Detective Fiction
15 	- Science Fiction
6 	- Fantasy Fiction
0 	- Childrens Fiction
1 	- Classical Fiction
14 	- Romance Literature 
7 	- Fiction
15 	- Science Fiction
2 	- Classics
15 	- Science Fiction
2 	- Classics
11 	- Literature
15 	- Science Fiction
15 	- Science Fiction
15 	- Science Fiction
11 	- Literature
15 	- Science Fiction
15 	- Science Fiction
15 	- Science Fiction
2 	- Classics
2 	- Classics
15 	- Science Fiction
12 	- Love Fiction 
17 	- Sociology Fiction
11 	- Literature
13 	- Political Fiction
15 	- Science Fiction
15 	- Science Fiction
15 	- Science Fiction
17 	- Sociology Fiction
15 	- Science Fiction
18 	- 

## Splitting the dataset into test and train with 80:20 ratio.

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [15]:
X_train.shape

(103, 1)

## Applying Bag of Words for classification using different algorithms.

In [16]:

from sklearn.feature_extraction.text import CountVectorizer

In [17]:
cv = CountVectorizer()

In [18]:
X_train_bow = cv.fit_transform(X_train['content']).toarray()
X_test_bow = cv.transform(X_test['content']).toarray()

In [19]:
X_train_bow.shape

(103, 151410)

## Using Gaussian Naive Bayes Algorithm.

In [20]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)

GaussianNB()

In [21]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score,confusion_matrix
accuracy_score(y_test,y_pred)

0.3076923076923077

### As the accuracy is only 30% with Gaussian Naive Bayes. Trying with different algorithms.

In [22]:
confusion_matrix(y_test,y_pred)

array([[0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 2, 0, 0, 3, 0, 0, 0],
       [0, 0, 3, 0, 0, 2, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 4, 1, 0, 1, 4, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0]])

## Using Random Forest Classification Algorithm.

In [23]:
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)


0.5

### The accuracy is 50% with Random Forest.

## Using Random Forest Classification Algorithm with TF-IDF.

In [24]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train['content']).toarray()
X_test_tfidf = tfidf.transform(X_test['content'])

In [25]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test,y_pred)

0.5

### The accuracy is 53% with Random Forest and TF-IDF.

#### The dataset has only few stories, hence the accuracy is 53%. With increase in the stories to train the model the accuracy might increase.

## Prediction of a story

In [26]:
#s = stories['content'].iloc[4]

In [27]:
#story_test_bow = cv.transform(s.split()).toarray()

In [28]:
#story_pred = rf.predict(story_test_bow)

In [29]:
#story_pred[1]

In [30]:
#s = stories.iloc[5].content

## Recommedation logic

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [32]:
 df_recos = pd.DataFrame(
        columns=[
            "bookno",
            "first_reco",
            "second_reco",
            "third_reco",
            "fourth_reco",
            "fifth_reco",
        ]
    )
# stories = pd.read_csv('../input/stories-combined/stories_combined.csv')
stories = pd.read_csv('files/stories_combined.csv')

def recomendation():   
    my_stop_words = text.ENGLISH_STOP_WORDS.union(
        [
            "gutenberg",
            "ebook",
            "online",
            "distributed",
            "transcriber",
            "etext",
            "note",
            "copyright",
            "start",
            "project",
            "end",
            "produced",
            "proofreading",
            "team",
            "http",
            "www",
            "pgdp",
            "net",
            "illustrated",
        ]
    )
    vectorizer = TfidfVectorizer(stop_words=my_stop_words)
    vectorizer.fit(stories["content"])
    X_vector = vectorizer.transform(stories["content"])
    similarity_matrix = cosine_similarity(X_vector)
    df_recos["bookno"] = np.array(stories["bookno"])
    i = 0
    while i <= df_recos.shape[0] - 1:

        df_recos.iloc[i] = np.take(
            np.array(stories["bookno"]), np.argsort(similarity_matrix)[:, -1:-7:-1][i]
        )
        i += 1
    return "Recommendations done"


In [33]:
recomendation()

'Recommendations done'

In [34]:
df_recos

Unnamed: 0,bookno,first_reco,second_reco,third_reco,fourth_reco,fifth_reco
0,51082.txt,3254.txt,45964-8.txt,15667-8.txt,9363-8.txt,32101-8.txt
1,32243.txt,15667-8.txt,3254.txt,45964-8.txt,2814-0.txt,1646-8.txt
2,306-0.txt,3254.txt,45964-8.txt,9363-8.txt,28636-8.txt,15667-8.txt
3,31038.txt,9363-8.txt,3254.txt,15667-8.txt,1646-8.txt,43558-0.txt
4,28636-8.txt,3254.txt,45964-8.txt,13260-0.txt,15667-8.txt,5835-0.txt
...,...,...,...,...,...,...
124,7114-8.txt,45964-8.txt,3254.txt,3077-0.txt,13260-0.txt,28636-8.txt
125,3077-0.txt,10577-8.txt,45964-8.txt,3254.txt,15667-8.txt,13260-0.txt
126,1096.txt,1655.txt,710.txt,32101-8.txt,45964-8.txt,12336-8.txt
127,59157.txt,51687.txt,45964-8.txt,3254.txt,15667-8.txt,9363-8.txt


In [35]:
df_recos.set_index('bookno').T

bookno,51082.txt,32243.txt,306-0.txt,31038.txt,28636-8.txt,6258.txt,23210-0.txt,51531.txt,3250-0.txt,52776.txt,...,34470-8.txt,5592.txt,512.txt,43558-0.txt,59345.txt,7114-8.txt,3077-0.txt,1096.txt,59157.txt,12336-8.txt
first_reco,3254.txt,15667-8.txt,3254.txt,9363-8.txt,3254.txt,6259.txt,15667-8.txt,3254.txt,15667-8.txt,13707-8.txt,...,3254.txt,13707-8.txt,13707-8.txt,3254.txt,45964-8.txt,45964-8.txt,10577-8.txt,1655.txt,51687.txt,710.txt
second_reco,45964-8.txt,3254.txt,45964-8.txt,3254.txt,45964-8.txt,45964-8.txt,16380-8.txt,15667-8.txt,3254.txt,15667-8.txt,...,28636-8.txt,45964-8.txt,2148-0.txt,15667-8.txt,15667-8.txt,3254.txt,45964-8.txt,710.txt,45964-8.txt,45964-8.txt
third_reco,15667-8.txt,45964-8.txt,9363-8.txt,15667-8.txt,13260-0.txt,3254.txt,13441-8.txt,1646-8.txt,1646-8.txt,3254.txt,...,15667-8.txt,3254.txt,45964-8.txt,49913-0.txt,1646-8.txt,3077-0.txt,3254.txt,32101-8.txt,3254.txt,32101-8.txt
fourth_reco,9363-8.txt,2814-0.txt,28636-8.txt,1646-8.txt,15667-8.txt,15667-8.txt,3254.txt,13441-8.txt,45964-8.txt,1646-8.txt,...,5835-0.txt,13260-0.txt,3254.txt,5835-0.txt,3254.txt,13260-0.txt,15667-8.txt,45964-8.txt,15667-8.txt,1655.txt
fifth_reco,32101-8.txt,1646-8.txt,15667-8.txt,43558-0.txt,5835-0.txt,13260-0.txt,49913-0.txt,9363-8.txt,9363-8.txt,9363-8.txt,...,9363-8.txt,10577-8.txt,13260-0.txt,2814-0.txt,16380-8.txt,28636-8.txt,13260-0.txt,12336-8.txt,9363-8.txt,3254.txt


In [36]:
recomendations_dict = df_recos.set_index('bookno').T.to_dict('list')

In [37]:
import pickle

In [38]:
with open('files/saved_recomendations.pkl', 'wb') as f:
    pickle.dump(recomendations_dict, f)

In [39]:
with open('files/saved_recomendations.pkl', 'rb') as f:
    loaded_dict = pickle.load(f)

In [40]:
loaded_dict['51082.txt']

['3254.txt', '45964-8.txt', '15667-8.txt', '9363-8.txt', '32101-8.txt']

In [41]:
stories

Unnamed: 0.1,Unnamed: 0,bookno,content,Title,Author,Language,Category
0,0,51082.txt,*** START OF THIS PROJECT GUTENBERG EBOOK COMI...,Coming Attraction,Fritz Leiber,English,Science Fiction
1,1,32243.txt,*** START OF THIS PROJECT GUTENBERG EBOOK CONF...,Confidence Game,James McKimmey,English,Science Fiction
2,2,306-0.txt,*** START OF THIS PROJECT GUTENBERG EBOOK EARL...,"The Early Short Fiction of Edith Wharton, Par...",Edith Wharton,English,Fiction
3,3,31038.txt,*** START OF THIS PROJECT GUTENBERG EBOOK THE ...,The Real Hard Sell,William W Stuart,English,Science Fiction
4,4,28636-8.txt,*** START OF THIS PROJECT GUTENBERG EBOOK THE ...,The Grey Woman and other Tales,Mrs. (Elizabeth) Gaskell,English,Fiction
...,...,...,...,...,...,...,...
124,124,7114-8.txt,*** START OF THIS PROJECT GUTENBERG EBOOK UNE ...,"Une Vie, A Piece of String and Other Stories",Guy de Maupassant,English,Fiction
125,125,3077-0.txt,*** START OF THIS PROJECT GUTENBERG EBOOK MAUP...,"Original Short Stories of Maupassant, Volume 1",Guy de Maupassant,English,Fiction
126,126,1096.txt,The Project Gutenberg Etext of The Faith of Me...,The Faith of Men,Jack London,English,Fiction
127,127,59157.txt,*** START OF THIS PROJECT GUTENBERG EBOOK ESCA...,Escape Mechanism,Charles E. Fritch,English,Science Fiction


In [42]:
story_dict = stories[['bookno','Title','content','Author','Category']].set_index('bookno').T.to_dict()

In [43]:
with open('files/saved_stories.pkl', 'wb') as st:
    pickle.dump(story_dict, st)

In [44]:
with open('files/saved_stories.pkl', 'rb') as f:
    loaded_stories = pickle.load(f)

In [45]:
loaded_stories['59157.txt']['Title']

' Escape Mechanism'