# 0. Installation (one time job)

In [4]:
#!pip install scikit-learn==0.23.1

In [5]:
 #xlrd no longer support xlsx - https://stackoverflow.com/questions/65254535/xlrd-biffh-xlrderror-excel-xlsx-file-not-supported
#!pip install openpyxl

# 1. Import Library

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from TextPreprocessing import text_preprocessing

# 2. Check Data

In [7]:
# Read data
pd.set_option('display.max_columns', None)

# data = pd.read_excel('./ASD FAQ KB v001.xlsx', sheet_name='FAQ')
data = pd.read_excel('./ASD FAQ KB v001.xlsx', sheet_name='FAQ', engine='openpyxl')

data.head()

Unnamed: 0,sn,Question,Long_Answer,Short_Answer,Source,Remarks
0,1,What are Autism Spectrum Disorders (ASD)?,ASD refers to a wide spectrum of neurodevelopm...,,http://birchtreecenter.org/learn/autism,
1,2,How common is autism?,According to a 2020 report commissioned by the...,,http://birchtreecenter.org/learn/autism,
2,3,What causes autism? Can it be cured?,The causes of this complex disorder remain unc...,,http://birchtreecenter.org/learn/autism,
3,4,Why doesn’t intervention center refer to its s...,Our students are children or youth who are cha...,,http://birchtreecenter.org/learn/autism,
4,5,What are the types of Autism Spectrum Disorders?,Autistic Disorder; Asperger Syndrome; Pervasiv...,,http://dhss.alaska.gov/dph/wcfh/Pages/autism/s...,


In [8]:
print(data.shape)

(226, 6)


# 3. Data Preprocessing

In [9]:
# Select long_answer from the data
long_answer = data.get('Long_Answer')

# Preprocess the long_answer
long_answer = long_answer.map(lambda x: ' '.join(text_preprocessing(x)))

# Vectorize the answers (one-hot)
sparse_vectorizer = CountVectorizer(strip_accents = 'unicode')
sparse_vectors = sparse_vectorizer.fit_transform(long_answer)

# (226, 2753)
print(sparse_vectors.shape)

(226, 2753)


# 4. Build Topic Model using LDA

In [11]:
# Your super power to define number of topics
n_topics = 4

# Run LDA to generate topics/clusters
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=1000,
                                learning_method='online',
                                random_state=0)

lda.fit(sparse_vectors)

LatentDirichletAllocation(learning_method='online', max_iter=1000,
                          n_components=4, random_state=0)

# 5. Display the resulting topics/clusters of ASD FAQ's Long_Answer field

In [13]:
# Print the top-n key words
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [14]:
# Show the first n_top_words key words
n_top_words = 10
feature_names = sparse_vectorizer.get_feature_names()

for i, topic in enumerate(lda.components_):
    print('Topic {num}'.format(num=i+1))
    print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

Topic 1
autism disorder asd child may spectrum people behavior social cause

Topic 2
child autism may treatment intervention diagnosis early help parent therapy

Topic 3
institute national autism tel information health disorder fax behavior md

Topic 4
ability assessment behaviour concern specific characteristic academic ot memory diet



In [14]:
print("1st document(long FAQ answer) belongs to Topic",lda.transform(sparse_vectors[0]).argmax()+1)

1st document(long FAQ answer) belongs to Topic 1


In [16]:
# Display topics of first 20 long FAQ answers:
for i in range(80, 100):
    print("Document (long FAQ answer)", i+1, "belongs to Topic", lda.transform(sparse_vectors[i]).argmax()+1)

Document (long FAQ answer) 81 belongs to Topic 2
Document (long FAQ answer) 82 belongs to Topic 2
Document (long FAQ answer) 83 belongs to Topic 2
Document (long FAQ answer) 84 belongs to Topic 1
Document (long FAQ answer) 85 belongs to Topic 1
Document (long FAQ answer) 86 belongs to Topic 1
Document (long FAQ answer) 87 belongs to Topic 1
Document (long FAQ answer) 88 belongs to Topic 1
Document (long FAQ answer) 89 belongs to Topic 1
Document (long FAQ answer) 90 belongs to Topic 1
Document (long FAQ answer) 91 belongs to Topic 1
Document (long FAQ answer) 92 belongs to Topic 1
Document (long FAQ answer) 93 belongs to Topic 1
Document (long FAQ answer) 94 belongs to Topic 1
Document (long FAQ answer) 95 belongs to Topic 1
Document (long FAQ answer) 96 belongs to Topic 1
Document (long FAQ answer) 97 belongs to Topic 1
Document (long FAQ answer) 98 belongs to Topic 2
Document (long FAQ answer) 99 belongs to Topic 1
Document (long FAQ answer) 100 belongs to Topic 1


# 6. Interpret the identified topics (using top 10 words)

### Topic 1 is about: 
Defination of ASD

### Topic 2 is about: 
Treatment of ASD

### Topic 3 is about: 
Institutes/Research of ASD research

### Topic 4 is about: 
Aassessment/Symptom of ASD


# 7. Food for thought: What about the clusters/topics of "Questons"?

In [16]:
# Read data
pd.set_option('display.max_columns', None)

# data = pd.read_excel('./ASD FAQ KB v001.xlsx', sheet_name='FAQ')
data = pd.read_excel('./ASD FAQ KB v001.xlsx', sheet_name='FAQ', engine='openpyxl')

# Select long_answer from the data
question = data.get('Question')

# Preprocess the long_answer
question = long_answer.map(lambda x: ' '.join(text_preprocessing(x)))

# Vectorize the answers (one-hot)
sparse_vectorizer = CountVectorizer(strip_accents = 'unicode')
sparse_vectors = sparse_vectorizer.fit_transform(question)

# Your super power to define number of topics
n_topics = 6

# Run LDA to generate topics/clusters
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=1000,
                                learning_method='online',
                                random_state=0)

lda.fit(sparse_vectors)

n_top_words = 10
feature_names = sparse_vectorizer.get_feature_names()

for i, topic in enumerate(lda.components_):
    print('Topic {num}'.format(num=i+1))
    print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

Topic 1
vaccine autism thimerosal evidence link concern mmr methylmercury research study

Topic 2
child may autism behavior asd social treatment symptom parent skill

Topic 3
autism disorder asd genetic cause brain research child disease study

Topic 4
disorder autism child spectrum asd diagnosis social asperger syndrome developmental



---
`The end is called the new start.` --- ISS : **I** **S**(elf) **S**(tudy)