## Import Python Library
### We import all the relevant libraries
- Pandas
- Numpy
- gensim
- nltk - Natural Language Tool Kit
- pyLDAvis
- pickle

In [128]:
import pandas as pd
import numpy as np
import os
import re

import gensim
from gensim.utils import simple_preprocess
import nltk
#nltk.download('stopwords')

import gensim.corpora as corpora
from nltk.corpus import stopwords
from pprint import pprint

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import pickle 

import warnings
warnings.filterwarnings('ignore')

## Import Data
##### The data was extracted from www.quora.com by manually searching for questions relevant to the UIUC Computer Science Department. The extracted documents (text) are stored in an Excel sheet (quora_data.xls; note that the code below reads quora_data.csv) with 4 columns, as listed below

- URL 
- Relevance
- Question
- Answer

##### We are interested in those questions, and their related answers, where "Relevance" = "Yes". These questions relate only to the Computer Science Department of the University of Illinois Urbana-Champaign (UIUC), where users ask for feedback about the department. Other questions, such as comparisons between universities, are marked as "Relevance" = "No".


In [29]:
# Load the Quora question/answer table from the CSV file in the working directory
quora_data = pd.read_csv(filepath_or_buffer='quora_data.csv')

# Preview the first five rows (last expression of the cell, so it is displayed)
quora_data.head(n=5)

Unnamed: 0,URL,Relevance,Question,Answer
0,https://www.quora.com/What-is-it-like-to-study...,Yes,What is it like to study computer science at U...,Background: I got my MS in CS from UIUC and ta...
1,https://www.quora.com/What-is-it-like-to-study...,Yes,What is it like to study computer science at U...,Thanks for the A2A.\n\nAs a current CS student...
2,https://www.quora.com/What-is-it-like-to-study...,Yes,What is it like to study computer science at U...,"Even graduating just a year ago, I am not sure..."
3,https://www.quora.com/How-hard-is-it-to-get-in...,Yes,How hard is it to get into computer science at...,CS has been very difficult to get in. A lot of...
4,https://www.quora.com/How-hard-is-it-to-get-in...,Yes,How hard is it to get into computer science at...,The Computer Science major at UIUC is extremel...


## Data Description & Extraction

In [89]:
# Overall size of the data set, printed as a (rows, columns) tuple
print("Number of rows & data in the quora_data: {}".format(quora_data.shape))

# Number of distinct question URLs in the full data set
print("Number of Distinct URL in quora_data: {}".format(len(quora_data['URL'].unique())))

# Keep only the rows flagged as relevant. An explicit copy is taken so that
# columns added to quora_data_relevance afterwards go to an independent
# DataFrame rather than to a slice of quora_data.
quora_data_relevance = quora_data[quora_data["Relevance"] == "Yes"].copy()
print("Relevanced Data in quora_data (Relevance = Yes): {}".format(quora_data_relevance.shape))

# Number of distinct question URLs among the relevant rows
print("Number of Distinct URL which are Relevance: {}".format(len(quora_data_relevance['URL'].unique())))

# List every distinct relevant question, numbered from 1.
# unique() is evaluated once here instead of once per loop iteration.
print("-----------------------------------------------------------")
for position, question in enumerate(quora_data_relevance["Question"].unique(), start=1):
    print("{}.{}".format(position, question))
print("-----------------------------------------------------------")

# Show up to five distinct answers; slicing (rather than indexing 0..4)
# avoids an IndexError when fewer than five distinct answers exist.
for position, answer in enumerate(quora_data_relevance["Answer"].unique()[:5], start=1):
    print("{}.{}".format(position, answer))
    print("\n\n")
print("-----------------------------------------------------------")

Number of rows & data in the quora_data: (56, 4)
Number of Distinct URL in quora_data: 12
Relevanced Data in quora_data (Relevance = Yes): (40, 4)
Number of Distinct URL which are Relevance: 9
-----------------------------------------------------------
1.What is it like to study computer science at UIUC?
2.How hard is it to get into computer science at UIUC?
3.How is life as a computer science student at UIUC's college of engineering?
4.How is the University of Illinois, Chicago for an MS in CS?
5.Why UIUC CS is so highly ranked when it is relatively easier to get into it as compared with other top CS schools?
6.How's the University of Illinois? How difficult is it to get into its CS program?
7.What's the reputation of the University of Illinois at Chicago (UIC) for CS?
8.How good is the MS in Computer Science and Engineering program of the University of Illinois at Chicago, considering the quality of education, job & internship scene and part time jobs?
9.According to the U.S. News an

## Data Cleaning
#### The following steps are performed as part of the data cleaning
- Remove the special characters (",", ".", "!", "?")
- Convert the text to lowercase

In [129]:
# Build the cleaned text column "Answer_processed" from "Answer":
#   1. missing answers become empty strings and every value is coerced to str,
#      so a NaN cell cannot raise TypeError during the regex substitution;
#   2. commas, periods, exclamation marks and question marks are removed
#      (raw-string pattern, so no "\." escape inside an ordinary string literal);
#   3. the text is converted to lowercase.
# assign() returns a new DataFrame, so the column is added without writing
# into a frame that may be a filtered slice of another DataFrame.
quora_data_relevance = quora_data_relevance.assign(
    Answer_processed=(
        quora_data_relevance['Answer']
        .fillna('')
        .astype(str)
        .str.replace(r'[,.!?]', '', regex=True)
        .str.lower()
    )
)


## Data Cleaning
#### The following steps are performed as part of the data cleaning
- Remove the punctuation marks
- Remove the stop words

In [130]:
# Start from NLTK's English stop-word list
stopping_words = stopwords.words('english')

# Append extra words on top of the standard English stop words
# (in-place list extension, same order as before)
stopping_words += [
    'uiuc', 'cs', 'uic', 'university', 'graduate', 'illinois', 'school',
    'also', 'know', 'like', 'get', 'department', 'take', 'many', 'would',
    'chicago', 'since', 'taking', 'go', 'really', 'one', 'may', 'even',
    'lot', 'computer', 'science',
]

def convert_words(sentences):
    """Lazily tokenise each item of ``sentences``.

    Every item is converted to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one token list is
    yielded per input item, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    for raw_text in sentences:
        tokens = tokenize(str(raw_text), deacc=True)
        yield tokens

def removing_stopwords(texts):
    """Return every document in ``texts`` as a token list without stop words.

    Each document is converted to ``str`` and tokenised with
    ``simple_preprocess``; tokens contained in the module-level
    ``stopping_words`` list are dropped.

    Args:
        texts: iterable of documents (the notebook passes lists of tokens).

    Returns:
        A list holding one list of remaining tokens per input document.
    """
    # The stop-word list in this notebook is named ``stopping_words``.
    # Converting it to a set once makes each membership test O(1).
    excluded = set(stopping_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in excluded]
        for doc in texts
    ]


# Pull the cleaned answers out of the DataFrame as a plain Python list
text_data = quora_data_relevance['Answer_processed'].tolist()

# Tokenise every answer into a list of words
final_data = [tokens for tokens in convert_words(text_data)]

# Drop the stop words from each token list
final_data = removing_stopwords(final_data)

# Sanity check: first 30 tokens of the first document
print(final_data[0][:30])

['background', 'got', 'ms', 'taught', 'couple', 'undergraduate', 'courses', 'course', 'levels', 'range', 'undergraduate', 'found', 'level', 'courses', 'rigorous', 'thorough', 'right', 'mix', 'theory', 'practice', 'assignments', 'machine', 'problems', 'homes', 'exams', 'well', 'thought', 'tested', 'right', 'abilities']


In [131]:
# Create Dictionary: maps every unique token of the cleaned documents to an integer id.
# It is built from final_data, the same token lists the corpus is built from.
id_as_word = corpora.Dictionary(final_data)
# The same dictionary under gensim's conventional name, for code that refers to it as id2word
id2word = id_as_word

# Bag-of-words corpus: one doc2bow() result per document
corpus = [id_as_word.doc2bow(text) for text in final_data]

# Number of Topics
num_topics = 5

# Create the LDA model from the corpus and its dictionary
lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id_as_word, num_topics=num_topics)
# Apply the trained model to the corpus it was trained on
doc_lda = lda_model[corpus]

In [132]:
# Visualize the topics
pyLDAvis.enable_notebook()

# File in which the prepared visualisation data is cached ('ldavis_' + topic count)
LDAvis_data_filepath = os.path.join('ldavis_' + str(num_topics))

# Preparing the visualisation is a bit time consuming. Set this flag to False
# to reuse the file written by an earlier run instead of recomputing it.
rebuild_visualisation = True
if rebuild_visualisation:
    # id_as_word is the dictionary the LDA model was created with
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id_as_word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# Load the pre-prepared pyLDAvis data from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a file that this notebook wrote itself.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

# Also export the visualisation as a standalone HTML page
pyLDAvis.save_html(LDAvis_prepared, 'ldavis_' + str(num_topics) + '.html')

# Last expression of the cell, so the notebook displays the visualisation inline
LDAvis_prepared