### Medium Search Engine - Processing Data

In [18]:
import datetime as dt
import pickle
from string import punctuation

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load the cleaned Medium stories dataset straight from the zipped CSV.
# NOTE(review): the saved output under this cell looks like the tail of a pandas
# warning (possibly a mixed-dtype DtypeWarning) — confirm, and consider passing
# explicit dtypes if so.
medium_stories = pd.read_csv('Medium_Clean.zip',compression='zip')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Discard the "Unnamed: 0" column (presumably a leftover saved index — verify).
medium_stories.drop(columns=["Unnamed: 0"], inplace=True)

In [4]:
# Preview the first two rows to sanity-check the loaded columns.
medium_stories.head(2)

Unnamed: 0,Title,Subtitle,Image,Author,Publication,Year,Month,Day,Reading_Time,Claps,...,Tag_travel,Tag_trump,Tag_ux,Tag_venture-capital,Tag_web-design,Tag_web-development,Tag_women,Tag_wordpress,Tag_work,Tag_writing
0,Online Animation: Mixamo vs Norah AI,Online animations tools provide game designers...,1,Emma Laurent,,2017,8,1,5,12.0,...,0,0,0,0,0,0,0,0,0,0
1,A.I. ?,,0,Sanparith Marukatat,,2017,8,1,2,11.0,...,0,0,0,0,0,0,0,0,0,0


#### Cleaning Title Column

In [5]:
# Removing NaN values
# Assign the result back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place form operates on an intermediate object, is
# deprecated as chained assignment, and silently does nothing under pandas
# Copy-on-Write, which would leave NaN titles in place.
medium_stories['Title'] = medium_stories['Title'].fillna('')

In [6]:
# English vocabulary from the NLTK `words` corpus (the corpus must be downloaded).
# NOTE(review): `words` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
words = set(nltk.corpus.words.words())

# English stop words plus individual punctuation characters. Stored as a set
# because clean_text tests membership once per token across the whole corpus;
# a set gives constant-time lookups where the previous list was a linear scan.
stuff_to_be_removed = set(stopwords.words("english")) | set(punctuation)

In [7]:
# Single WordNet lemmatizer instance, reused by clean_text for every token.
lemmatizer = WordNetLemmatizer()

In [8]:
def clean_text(sent):
    """Normalise one sentence for TF-IDF vectorisation.

    The sentence is lower-cased and tokenised, then tokens of two characters
    or fewer and tokens found in ``stuff_to_be_removed`` are discarded. The
    surviving tokens are lemmatised and joined with single spaces.

    Parameters
    ----------
    sent : str
        Raw sentence (here, a story title).

    Returns
    -------
    str
        The cleaned, space-separated sentence; empty if no token survives.
    """
    kept_lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(sent.lower())
        if len(token) > 2 and token not in stuff_to_be_removed
    ]
    return " ".join(kept_lemmas)

In [9]:
# Clean every title; clean_text is passed directly, no lambda wrapper needed.
final_corpus = medium_stories['Title'].apply(clean_text)

In [11]:
# Overwrite the raw titles with their cleaned versions (the originals are not kept).
medium_stories['Title'] = final_corpus

#### Converting all tag columns into a single column containing a list of tags
Doing so makes it straightforward to convert the labels into binary indicator values.

In [12]:
# Columns from position 12 onward — presumably the one-hot Tag_* indicator
# columns; verify against the dataset schema.
# .copy() gives an independent frame, so the column assignments performed on
# medium_tags later do not write into a slice of medium_stories (which raises
# SettingWithCopyWarning and has ambiguous semantics).
medium_tags = medium_stories.iloc[:,12:].copy()

In [13]:
cols = medium_tags.columns

# Swap each indicator for the column's own name where the value equals 1,
# and for an empty string everywhere else.
for tag in cols:
    is_tagged = medium_tags[tag].eq(1)
    medium_tags[tag] = is_tagged.map({True: tag, False: ''})

In [14]:
def _non_empty_values(row):
    """Return the truthy (non-empty) entries of one row as a list."""
    return [value for value in row.values if value]


# Collapse every row into the list of tag names that are set for it.
medium_tags['tag'] = medium_tags.apply(_non_empty_values, axis=1)

In [15]:
# Remove every column from position 12 onward from the stories frame.
medium_stories.drop(columns=medium_stories.columns[12:], inplace=True)

In [16]:
# Attach the per-story tag lists as a single 'tag' column (aligned on the index).
medium_stories['tag'] = medium_tags['tag']

In [19]:
# Learn the set of tags and encode each story's tag list as a binary
# indicator row in a single step.
multilabel_binarizer = MultiLabelBinarizer()

# transform target variable
y = multilabel_binarizer.fit_transform(medium_stories['tag'])

In [20]:
# Persist the fitted label binarizer to a pickle file so it can be reused.
# The context manager closes the file handle deterministically, so the data
# is fully flushed to disk before anything else tries to read it.
with open('label_binarizer.pkl', 'wb') as binarizer_file:
    pickle.dump(multilabel_binarizer, binarizer_file)

In [21]:
# TF-IDF vectorizer for the titles: max_df=0.8 ignores terms that occur in more
# than 80% of documents, and max_features=10000 caps the vocabulary at the
# 10,000 most frequent terms.
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=10000)

In [22]:
# NOTE(review): tfidf_vectorizer has only been constructed at this point (it is
# fitted further down the notebook), so this file holds an *unfitted*
# vectorizer unless it is written again after fitting.
# The context manager makes sure the file handle is closed.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

In [23]:
# Hold out 20% of the stories for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    medium_stories['Title'],
    y,
    test_size=0.2,
    random_state=9,
)

In [24]:
# create TF-IDF features
# The vocabulary and IDF weights are learned from the training titles only;
# the validation titles are merely transformed with them.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# Save the *fitted* vectorizer. An unfitted TfidfVectorizer has no vocabulary
# and cannot transform new queries, so the reusable artefact must be written
# after fit_transform. This overwrites any earlier copy of vectorizer.pkl.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

### Training the model on prediction of labels/tags

This model predicts the tag(s) for a given input query. The predicted tags are then used to narrow the search down to the related subset of observations, which saves computation time because sentence similarities only need to be computed on that subset of the data.

In [26]:
# One-vs-rest wrapper around a default LogisticRegression base estimator:
# one binary classifier is fitted per tag.
lr = LogisticRegression()
clf = OneVsRestClassifier(lr)

In [27]:
# fit model on train data

# Wall-clock timing around the fit so the training cost is visible in the output.
start = dt.datetime.now()
clf.fit(X_train_tfidf, y_train)
elapsed = dt.datetime.now() - start

print('Elapsed Time: ', str(elapsed))



Elapsed Time:  0:18:10.818064


In [28]:
# Persist the trained classifier. The context manager closes the file handle
# deterministically, so the pickle is fully written to disk.
with open('finalized_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [29]:
# make predictions for validation set
# The result has the same binary indicator layout as y_val (one column per tag).

y_pred = clf.predict(X_val_tfidf)

In [30]:
# Checking the inverse transform function: map the predicted indicator rows
# back to tag names and inspect the prediction at position 600.
multilabel_binarizer.inverse_transform(y_pred)[600]

('Tag_writing',)

In [31]:
# evaluate performance
# Micro-averaged F1: true/false positives and false negatives are pooled
# across all tags before the score is computed.
f1_score(y_val, y_pred, average="micro")

0.2572089235748995

In [32]:
# Save the per-story tag lists as a pickled Series.
medium_tags['tag'].to_pickle('medium_tags_pkl.pkl')

In [33]:
# Save the processed stories frame (cleaned titles plus the 'tag' list column).
medium_stories.to_pickle('medium_cleaned.pkl')

The pickle files generated here are used by `search_app.py` in the `app` folder.