# Latent Dirichlet Allocation - Could be a good feature? 
## NMF (Non-negative Matrix Factorization) follows a similar approach but is found to perform well on smaller datasets.
# We don't do "small datasets" :P

In [48]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

In [6]:
# Read the comma-separated sample; header=None makes pandas label the columns
# with integers instead of consuming the first row as a header.
data = pd.read_csv(
    "E:/Yelp/Unfiltered Data/YelpZip/Customs/SAMPLE",
    header=None,
)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0,0,5051,2014-12-05,Extremely slow kitchen went hour eat shift tol...,0,1.0,1
1,1,1,5071,2014-05-20,AWFUL half hour kill bypassed Capogiro go read...,0,1.0,1
2,2,2,5071,2014-05-20,Lovely spot Especially eat outside Food excell...,0,1.0,1
3,3,3,5076,2014-03-12,Sunday ordered soy caramel latte caramel vanil...,0,1.0,1
4,4,4,5077,2014-02-20,Toast youre killing Ive tried many times like ...,0,1.0,1


In [13]:
# The frame loaded above has eight integer-labelled columns (0-7), so assigning
# two names directly raises a length-mismatch error on a fresh kernel. Select
# the two columns of interest first, then name them.
# NOTE(review): positions are inferred from the previews of the raw and renamed
# frames (review text at position 4, integer-valued last column as the rating);
# confirm that position 7, not 6, is the intended rating column.
data = data.iloc[:, [4, 7]].copy()  # copy so later column assignments do not hit a view
data.columns = ['text', 'rating']
data.head()

Unnamed: 0,text,rating
0,Extremely slow kitchen went hour eat shift tol...,1
1,AWFUL half hour kill bypassed Capogiro go read...,1
2,Lovely spot Especially eat outside Food excell...,1
3,Sunday ordered soy caramel latte caramel vanil...,1
4,Toast youre killing Ive tried many times like ...,1


In [23]:
# CountVectorizer needs every document to be a str. Object dtype itself is fine;
# the problem is non-string entries in the column (presumably NaN for empty
# reviews -- TODO confirm). Blank out missing values before coercing so they do
# not become the literal token "nan" in the vocabulary.
data['text'] = data['text'].fillna('').astype(str)

In [22]:
# LDA is a probabilistic model over word counts, so it is fed raw term counts
# (CountVectorizer) rather than TF-IDF weights.
tf_vec = CountVectorizer(max_df=0.90, min_df=10, max_features=1000, stop_words='english')
tf = tf_vec.fit_transform(data['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations.
# list(...) keeps tf_features a plain list, as the old method returned.
if hasattr(tf_vec, 'get_feature_names_out'):
    tf_features = list(tf_vec.get_feature_names_out())
else:
    tf_features = tf_vec.get_feature_names()

In [24]:
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Fit a 10-topic LDA model on the term-count matrix using online variational
# Bayes. The number of topics is passed as `n_components`: the old `n_topics`
# keyword was deprecated in scikit-learn 0.19 and later removed, so it raises
# a TypeError on current versions. random_state pins the result.
lda = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    learning_offset=50,
    random_state=0,
).fit(tf)



In [30]:
def topics(model, features, top_words):
    """Print the highest-weighted terms of each topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, iterated one row per topic; each
        row must provide ``argsort()``.
    features : sequence
        Maps a column index of ``components_`` to its term.
    top_words : int
        Number of terms to print per topic.

    Returns
    -------
    None
        Output goes to stdout: a "Topic N:" header line followed by one line
        of space-separated terms, strongest first.
    """
    for number, weights in enumerate(model.components_):
        print("Topic %d:" % number)
        # argsort() is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:top_words]
        terms = [features[position] for position in strongest]
        print(" ".join(terms))

# Show the ten strongest terms for every topic of the fitted LDA model.
topics(model=lda, features=tf_features, top_words=10)

Topic 0:
bar beer bartender drinks loud music beers ribs world certainly
Topic 1:
star used food time called location years phone woman cake
Topic 2:
food good like chicken place ordered sauce dish restaurant tasted
Topic 3:
horrible money sandwich service food dry worst customer disgusting counter
Topic 4:
place food great good staff service restaurant love nice mediocre
Topic 5:
pizza burger fries cheese order ordered delivery burgers good pie
Topic 6:
food service good brunch coffee terrible slow wait breakfast price
Topic 7:
steak cheese philly better genos meat pats cheesesteak street english
Topic 8:
place like dont food good really im people know time
Topic 9:
table minutes said asked told came restaurant food order time


In [46]:
# Positional split: rows 0-9,999 are used for fitting and rows 10,000-19,999
# are held out.
train, test = data.iloc[0:10000], data.iloc[10000:20000]

In [71]:
# Vectorize the review text. The 'text' column is passed explicitly: iterating
# a DataFrame yields its column labels, so fitting on `train` itself would
# build the vocabulary from the strings 'text' and 'rating' instead of from the
# reviews. Raw term counts are used rather than TF-IDF because LDA models word
# counts.
# NOTE(review): max_features=7 keeps only the 7 most frequent terms -- confirm
# that such a small vocabulary is intended.
tf_vectorizer = CountVectorizer(max_features=7)
X_train = tf_vectorizer.fit_transform(train['text'])

# Train the model. `n_components` replaces the removed `n_topics` keyword, and
# a fixed random_state makes the fitted topics reproducible.
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(X_train)

# Infer the document-topic distribution for the held-out reviews, reusing the
# vocabulary learned on the training split.
X_test = tf_vectorizer.transform(test['text'])
# A plain ndarray is used instead of the discouraged np.matrix class.
doc_topic_dist_unnormalized = np.asarray(lda.transform(X_test))

# Normalize each row so it sums to 1 (only needed to work with probabilities).
# keepdims=True keeps the row sums as a column so the division broadcasts per
# document. NOTE(review): recent scikit-learn releases appear to return already
# normalized rows from transform(), in which case this is a no-op -- confirm
# against the installed version.
doc_topic_dist = doc_topic_dist_unnormalized / doc_topic_dist_unnormalized.sum(axis=1, keepdims=True)



# Alright! SKLearn is getting too confusing here! 