In [None]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from gensim.test.utils import get_tmpfile
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.decomposition import PCA
%matplotlib inline

In [None]:
# Load the raw ramen ratings table (path is relative to the notebook's working directory).
RAMEN_CSV = '../input/ramen-ratings/ramen-ratings.csv'
ramen_df = pd.read_csv(RAMEN_CSV)

# Report (rows, columns), then preview the first records.
n_rows, n_cols = ramen_df.shape
print((n_rows, n_cols))
ramen_df.head()

## The ID column is the review number

In [None]:
# Count the distinct review numbers to check whether 'Review #' can act as a row ID.
review_ids = ramen_df['Review #']
print(review_ids.nunique())

In [None]:
# Summarise the frame (pandas' built-in info() report).
ramen_df.info()

### Let's see what "Top Ten" means

In [None]:

# Distinct labels among the rows that actually carry a 'Top Ten' value.
ramen_df['Top Ten'].dropna().unique()

> "Top Ten" is given to the 10 best-ranked ramens of each year in 2013-2016 (4 years × 10 = 40 non-null values, minus 1 because of the record that contains only `\n`).

> Let's see if the review number is associated with the year.

In [None]:
# List the ranked rows ordered by review number, to compare review numbers against ranking years.
ranked_mask = ramen_df['Top Ten'].notna()
ramen_df.loc[ranked_mask, ['Review #', 'Top Ten']].sort_values(by='Review #')

### The review number looks to be associated with time

In [None]:
# Normalise 'Top Ten': both missing values and the stray "\n" entry mean the
# ramen was never ranked.
# The filled column is assigned back explicitly. Calling fillna(inplace=True)
# on `ramen_df['Top Ten']` is chained assignment: it may act on a temporary
# object and leave ramen_df untouched (pandas warns about this, and with
# Copy-on-Write enabled it never updates the parent frame).
ramen_df['Top Ten'] = ramen_df['Top Ten'].fillna('not ranked')
ramen_df.loc[ramen_df['Top Ten'] == "\n", 'Top Ten'] = 'not ranked'


In [None]:
# Build a table of the ranked ramens only, splitting the 'Top Ten' label into
# its year and rank parts.
# .copy() makes top_ramen an independent frame, so adding columns below does
# not trigger SettingWithCopyWarning or depend on view/copy semantics.
ranked_mask = ramen_df["Top Ten"] != "not ranked"
top_ramen = ramen_df.loc[ranked_mask].copy()

# expand=False returns a Series (one capture group), which is what a single
# column assignment expects.
top_ramen['year'] = top_ramen["Top Ten"].str.extract(r'([0-9]+)', expand=False)
# Capture only the digits after '#'. The previous version replaced '#' with a
# space, which left every rank with a leading blank (e.g. ' 10').
top_ramen['rank'] = top_ramen["Top Ten"].str.extract(r'#([0-9]+)', expand=False)

top_ramen = top_ramen.drop(columns='Top Ten')
top_ramen.head()

### let's do some exploration

In [None]:
# Show the distinct values stored in the 'Stars' column.
ramen_df['Stars'].unique()

In [None]:
# Show the dtype of every column.
ramen_df.dtypes

In [None]:
# Numeric star ratings, leaving out the rows labelled "Unrated".
rated_mask = ramen_df['Stars'] != "Unrated"
row = ramen_df.loc[rated_mask, 'Stars'].astype('float')

In [None]:
# Histogram of the numeric ratings in 20 bins.
fig, ax = plt.subplots()
ax.hist(row, bins=20);

> The distribution is left-skewed: positive ratings such as 5 (the highest frequency) and 4 are the most common.

In [None]:
# Number of rows whose rating is the literal string "Unrated".
int((ramen_df['Stars'] == "Unrated").sum())

> Only 3 records are unrated, so we can get rid of them.

In [None]:
# Remove the "Unrated" rows in place, then store the ratings as floats.
unrated_index = ramen_df.index[ramen_df['Stars'] == "Unrated"]
ramen_df.drop(index=unrated_index, inplace=True)
ramen_df['Stars'] = ramen_df['Stars'].astype('float')

In [None]:
# Print the number of distinct values in each categorical column.
category_cols = ['Brand', 'Variety', 'Style', 'Country']

for column_name in category_cols:
    distinct_count = ramen_df[column_name].nunique()
    print(distinct_count)

In [None]:
# Use one base colour for all bar charts (`color` is reused by later cells).
color = sns.color_palette()[0]

# Count of records per packaging style, most frequent first.
style_order = ramen_df['Style'].value_counts().index
sns.countplot(data=ramen_df, x='Style', color=color, order=style_order);

In [None]:
# Count of records per country as horizontal bars, most frequent first.
# `color` is the base colour chosen in the style count-plot cell.
country_order = ramen_df['Country'].value_counts().index
plt.figure(figsize=(12, 12))
sns.countplot(data=ramen_df, y='Country', color=color, order=country_order);


In [None]:
# Brands with more than 30 records (the counts are computed once and then filtered).
brand_counts = ramen_df['Brand'].value_counts()
brand_counts[brand_counts > 30]

In [None]:
# Horizontal bar chart of record counts for the brands with more than 10 records.
# `color` is the base colour chosen in the style count-plot cell.
B_count = ramen_df['Brand'].value_counts().loc[lambda counts: counts > 10]

plt.figure(figsize=(12, 12))
sns.barplot(x=B_count, y=B_count.index, color=color);


### Let's see the relation between country, brand, and style and the rating

In [None]:
# One violin plot of Stars per category column, limited to that column's
# 10 most frequent values (ordered by frequency).
cols = ['Country', 'Brand', 'Style']
fig, axs = plt.subplots(3, 1, figsize=(10, 10))
for ax, col in zip(axs, cols):
    top = ramen_df[col].value_counts().head(10).index.tolist()
    df = ramen_df[ramen_df[col].isin(top)]
    sns.violinplot(data=df, x=col, y='Stars', ax=ax, order=top);
    


In [None]:
# One box plot of Stars per category column, limited to that column's
# 10 most frequent values (ordered by frequency).
cols = ['Country', 'Brand', 'Style']
fig, axs = plt.subplots(3, 1, figsize=(10, 10))
for ax, col in zip(axs, cols):
    top = ramen_df[col].value_counts().head(10).index.tolist()
    df = ramen_df[ramen_df[col].isin(top)]
    sns.boxplot(data=df, x=col, y='Stars', ax=ax, order=top);
    


> The ratings are high in general. Among countries, Japan, Malaysia, Indonesia and Singapore have high peaks at the 5-star rating. The brands with higher ratings are Nissin, Nongshim, Paldo and Indomie.

> Style does not appear to affect the rating, except for the Box style, which has high ratings (maybe because of its very low number of records).

#### Let's see the effect of the review number on the rating

In [None]:
# Mean rating over consecutive chunks of 30 rows (chunks follow the frame's row order).
chunk_id = np.arange(len(ramen_df)) // 30
k = ramen_df['Stars'].groupby(chunk_id).mean()

plt.plot(k)
plt.ylim(0);  # start the y-axis at 0

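> The ratings decrease along the row order of the file (each point is the mean of 30 consecutive rows). Reading this as a decrease over time assumes the rows are sorted by ascending review number.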

In [None]:
# Cluster map of record counts per (Brand, Country), restricted to the 11
# countries with the most records and to brands with more than 50 records
# across those countries.
TopRamenCountris = ramen_df['Country'].value_counts()[:11]
df = ramen_df.query('Country in @TopRamenCountris.index')

# Name the count column explicitly instead of relying on the default label 0.
df = df.groupby(['Country', 'Brand']).size().reset_index(name='n_records')

# pivot() must be called with keyword arguments: positional index/columns/values
# were deprecated in pandas 1.x and are rejected by pandas 2.x.
df = df.pivot(index='Brand', columns='Country', values='n_records').fillna(0)

# Compute the per-brand totals once and keep the brands above the threshold.
brand_totals = df.sum(axis=1)
brands = brand_totals[brand_totals > 50].index
sns.clustermap(df.loc[brands, :]);

> Nissin is present in Hong Kong, USA, Singapore and mainly in Japan. The rest of the brands are in one or two places. USA, Japan and South Korea each have more than one main brand (3 main brands).

> The Indomie brand is mainly in Indonesia, and Indonesia has high ratings. From the previous plots we see that Indomie has high ratings too.

### Now let's take a closer look at Variety, as its values are nearly unique

In [None]:
# Collect every distinct whitespace-separated token that appears in 'Variety'
# and report how many there are.
variety_words = {
    token
    for variety in ramen_df['Variety']
    for token in variety.split()
}
len(variety_words)

> There are 1564 unique words in the varieties.

### Let's try to embed them and see if the rating is affected by them

In [None]:

# Train a small Doc2Vec model on the distinct variety names and persist it.
text = ramen_df['Variety'].unique()

# TaggedDocument expects a list of word tokens. Passing the raw string makes
# gensim iterate over it character by character, so the model would learn
# character vectors while inference later feeds it whitespace-split words.
# Tokenise here with the same .split() used at inference time.
documents = [
    TaggedDocument(words=doc.split(), tags=[i])
    for i, doc in enumerate(text)
]
model = Doc2Vec(documents, vector_size=6, window=2, min_count=1, workers=4)

# Persist the model to disk.
fname = get_tmpfile("my_doc2vec_model")
model.save(fname)



In [None]:

# Reload the persisted model from the temp path it was saved under.
fname = get_tmpfile("my_doc2vec_model")
model = Doc2Vec.load(fname)  # training could be continued with the loaded model

# Training is finished (only querying from here on), so release training-only
# state where the installed gensim supports it.
# NOTE(review): delete_temporary_training_data appears to be absent from newer
# gensim releases (believed removed in 4.0 — confirm). The guard keeps this
# cell runnable on both old and new versions.
if hasattr(model, "delete_temporary_training_data"):
    model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)


In [None]:
# Infer a 6-dimensional embedding for every variety name and store the
# components in columns '1'..'6'.
# All vectors are inferred first and then assigned one column at a time. The
# previous version wrote into the frame row by row via .loc inside iterrows(),
# which is slow and relies on .loc creating several missing columns on the fly.
embedding_cols = ['1', '2', '3', '4', '5', '6']
variety_vectors = np.vstack([
    model.infer_vector(variety.split())
    for variety in ramen_df['Variety']
])
for position, column_name in enumerate(embedding_cols):
    ramen_df[column_name] = variety_vectors[:, position]

### Now visualise the embeddings using PCA

In [None]:
# Project the 6-dimensional variety embeddings onto their first two principal
# components and attach the star rating of each row.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(ramen_df[['1', '2', '3', '4', '5', '6']])

# Reuse ramen_df's index for the projected frame. ramen_df has gaps in its
# index after the "Unrated" rows were dropped, so a fresh 0..n-1 index would
# make the index-aligned 'Stars' assignment below pair points with the wrong
# ratings and leave NaN in the trailing rows.
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
    index=ramen_df.index,
)
principalDf['Stars'] = ramen_df['Stars']

In [None]:
# Scatter of the two principal components, coloured by star rating.
pc1 = principalDf['principal component 1']
pc2 = principalDf['principal component 2']
points = plt.scatter(pc1, pc2, c=principalDf['Stars'], alpha=0.5)
plt.colorbar(points)

### The variety doesn't play a major role in separating the ratings

### Let's check if this is associated with the countries' contribution to the rating over time.

In [None]:
# The 11 countries with the most records, with their counts.
country_counts = ramen_df['Country'].value_counts()
TopRamenCountris = country_counts.head(11)
TopRamenCountris

In [None]:
# Mean rating per (bin of 40 consecutive rows, Country) for the top countries.
# After reset_index the unnamed bin level becomes the column 'level_0'.
df = ramen_df[ramen_df['Country'].isin(TopRamenCountris.index)]
bin_id = np.arange(len(df)) // 40
k = df[['Country', 'Stars']].groupby([bin_id, 'Country']).mean().reset_index()
k

In [None]:

# Country x row-bin matrix of mean stars. Bins in which a country has no
# records are filled with 0, so 0 here means "no data", not a zero rating.
# pivot() must be called with keyword arguments: positional index/columns/values
# were deprecated in pandas 1.x and are rejected by pandas 2.x.
d = k.pivot(index="Country", columns="level_0", values="Stars").fillna(0)

# Bin 55 is excluded, as in the original analysis — presumably the final,
# partially filled bin; TODO confirm against the data.
d = d.drop(columns=55)
sns.clustermap(d, col_cluster=False)

In [None]:
# Brand x row-bin matrix of mean stars for the 11 brands with the most records,
# shown as a cluster map with the bin (column) order preserved.
TopRamenBrands = ramen_df['Brand'].value_counts()[:11]
df = ramen_df.query('Brand in @TopRamenBrands.index')

# Mean rating per (bin of 40 consecutive rows, Brand); after reset_index the
# unnamed bin level becomes the column 'level_0'.
k = df.loc[:, ['Brand', 'Stars']].groupby([np.arange(len(df)) // 40, 'Brand']).mean()
k = k.reset_index()

# pivot() must be called with keyword arguments: positional index/columns/values
# were deprecated in pandas 1.x and are rejected by pandas 2.x.
# Bins in which a brand has no records are filled with 0 ("no data").
d = k.pivot(index="Brand", columns="level_0", values="Stars").fillna(0)
sns.clustermap(d, col_cluster=False)

> Some brands, such as Mama, Lucky Me! and Vina Acecook, don't have reviews in the early periods.
Samyang Foods doesn't have records in the late periods.

> Nissin has moderately high ratings over time.
Some brands have no records in some time intervals, and some, like Nongshim and Paldo, have really high ratings early on that decrease over time.

In [None]:
# Mean rating per (bin of 40 consecutive rows, Brand) for the 10 brands with
# the most records. After reset_index the bin level is the column 'level_0'.
TopRamenBrands = ramen_df['Brand'].value_counts().head(10)
df = ramen_df[ramen_df['Brand'].isin(TopRamenBrands.index)]
bin_id = np.arange(len(df)) // 40
k = df[['Brand', 'Stars']].groupby([bin_id, 'Brand']).mean().reset_index()

In [None]:
# One panel per top brand (three per row), each drawing mean Stars against the bin number.
facet_options = dict(col='Brand', col_order=TopRamenBrands.index, col_wrap=3, aspect=2)
g = sns.FacetGrid(data=k, **facet_options);
g.map(plt.plot, 'level_0', 'Stars');

There are many fluctuations, but in general some brands' ratings increase over time, some decrease, and some just maintain their level.