## Import Python Library
### We import all the relevant libraries
- Pandas
- Numpy
- gensim
- nltk - Natural Language Tool Kit
- pyLDAvis
- pickle

In [30]:
import pandas as pd
import numpy as np
import os
import re

import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')

import gensim.corpora as corpora
from nltk.corpus import stopwords
from pprint import pprint

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import pickle 

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sushanta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import Data
##### The data was extracted from www.quora.com by manually searching for questions relevant to the UIUC Computer Science Department. The extracted text is stored in a spreadsheet (quora_data.xls; the code below reads it as quora_data.csv) with the 4 columns below

- URL 
- Relevance
- Question
- Answer

##### We are interested only in the questions, and their answers, where "Relevance" = "Yes". These are questions about the University of Illinois Urbana-Champaign (UIUC) Computer Science Department in which users ask for feedback on the department. Other questions, such as comparisons between universities, are marked "Relevance" = "No".


In [4]:
# Load the Quora question/answer data from quora_data.csv
quora_data = pd.read_csv(filepath_or_buffer='quora_data.csv')

# Display the column names
quora_data.columns

Index(['URL', 'Relevance', 'Question', 'Answer'], dtype='object')

## Data Description & Extraction

In [5]:
# Shape (rows, columns) of the full data set
print("Number of rows & data in the quora_data: {}".format(quora_data.shape))

# Number of distinct question URLs in the full data set
print("Number of Distinct URL in quora_data: {}".format(len(quora_data['URL'].unique())))

# Keep only the rows marked as relevant. The explicit copy makes this an
# independent DataFrame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
quora_data_relevance = quora_data[quora_data["Relevance"] == "Yes"].copy()
print("Relevanced Data in quora_data (Relevance = Yes): {}".format(quora_data_relevance.shape))

# Number of distinct question URLs among the relevant rows
print("Number of Distinct URL which are Relevance: {}".format(len(quora_data_relevance['URL'].unique())))

# Compute the distinct questions and answers once instead of on every loop pass
unique_questions = quora_data_relevance["Question"].unique()
unique_answers = quora_data_relevance["Answer"].unique()

# Show all the distinct questions, numbered from 1
print("-----------------------------------------------------------")
for number, question in enumerate(unique_questions, start=1):
    print("{}.{}".format(number, question))
print("-----------------------------------------------------------")

# Show up to the first five distinct answers; slicing (rather than indexing
# 0..4) avoids an IndexError when fewer than five distinct answers exist
for number, answer in enumerate(unique_answers[:5], start=1):
    print("{}.{}".format(number, answer))
    print("\n\n")
print("-----------------------------------------------------------")

Number of rows & data in the quora_data: (56, 4)
Number of Distinct URL in quora_data: 12
Relevanced Data in quora_data (Relevance = Yes): (40, 4)
Number of Distinct URL which are Relevance: 9
-----------------------------------------------------------
1.What is it like to study computer science at UIUC?
2.How hard is it to get into computer science at UIUC?
3.How is life as a computer science student at UIUC's college of engineering?
4.How is the University of Illinois, Chicago for an MS in CS?
5.Why UIUC CS is so highly ranked when it is relatively easier to get into it as compared with other top CS schools?
6.How's the University of Illinois? How difficult is it to get into its CS program?
7.What's the reputation of the University of Illinois at Chicago (UIC) for CS?
8.How good is the MS in Computer Science and Engineering program of the University of Illinois at Chicago, considering the quality of education, job & internship scene and part time jobs?
9.According to the U.S. News an

## Data Cleaning
#### The following data cleaning steps are applied
- remove the punctuation characters (",", ".", "!", "?")
- convert the text to lowercase

In [6]:
# Strip basic punctuation (",", ".", "!", "?") from every answer and lowercase it.
# The pattern is a raw string so it is passed to the regex engine untouched by
# Python's string-escape handling. assign() returns a new DataFrame instead of
# writing a column into what may be a slice of quora_data, which is what
# triggers pandas' SettingWithCopyWarning.
quora_data_relevance = quora_data_relevance.assign(
    Answer_processed=quora_data_relevance['Answer']
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)


In [9]:
# Start from NLTK's English stop words and append extra words that are
# filtered out of this corpus as well
stopp_words = stopwords.words('english') + [
    'uiuc', 'cs', 'uic', 'university', 'graduate', "illinois", 'school',
    'also', 'know', 'like', 'get', 'department', 'take', 'many', 'would',
    'chicago', 'since', 'taking', 'go', 'really', 'one', 'may', 'even',
    'lot', "computer", "science",
]

def convert_words(sentences):
    """Lazily tokenize each item of *sentences* with gensim's ``simple_preprocess``.

    Every item is coerced to ``str`` and tokenized with ``deacc=True``; one
    result is yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

# Tokenize every processed answer (one token list per answer)
text_data = quora_data_relevance.Answer_processed.values.tolist()
final_data = list(convert_words(text_data))

# Drop the stop words. Each document is already a token list at this point, so
# it is filtered directly instead of being stringified and tokenized a second
# time. The set gives constant-time membership checks.
stop_word_set = set(stopp_words)
final_data = [[word for word in doc if word not in stop_word_set] for doc in final_data]


## Create the Model and fit LDA to generate the Top Topics

In [15]:
# Create Dictionary
id_to_word = corpora.Dictionary(final_data)

corpus = [id_as_word.doc2bow(text) for text in final_data]

# Number of Topics
num_topics = 5

# Create the LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,id2word=id_to_word,num_topics=num_topics)
doc_lda = lda_model[corpus]

In [27]:
#Create the folder & file location where the file needs to be dumped
folder_file_location = 'ldavis_'+str(num_topics)
ldavis_file_path = os.path.join(folder_file_location)

#Create the ldavis model and dump the model in the specified folder
ldavis_model = gensimvis.prepare(lda_model, corpus, id_to_word)
with open(ldavis_file_path, 'wb') as f:
    pickle.dump(ldavis_model, f)

#Save the Model in the HTML form
pyLDAvis.save_html(ldavis_model, folder_file_location +'.html')

ldavis_model


  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
