**Machine Learning Overview**

In [None]:
#import all packages
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from pylab import *
nltk.download('punkt')
nltk.download('wordnet')
import warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**read the valid data**

In [None]:
#read the json data
# The path is relative to the notebook's working directory.
review_data = pd.read_json('Musical_Instruments_5.json')
# Preview only the two columns used by the rest of this section.
review_data[['reviewText','overall']].head()


Unnamed: 0,reviewText,overall
0,"Not much to write about here, but it does exac...",5
1,The product does exactly as it should and is q...,5
2,The primary job of this device is to block the...,5
3,Nice windscreen protects my MXL mic and preven...,5
4,This pop filter is great. It looks and perform...,5


**do the lemmatization, tokenization and cleaning data**

In [None]:
#do the lemmatization, tokenization and cleaning data
lemmatizer=WordNetLemmatizer()

def _clean_review(raw_text):
    """Return one review as space-separated, lowercased, lemmatized tokens.

    Runs of punctuation/underscores are replaced by a single space before
    tokenizing, so only word characters reach the lemmatizer.
    """
    without_punct=re.sub(r'([^\s\w]|_)+',' ',str(raw_text))
    lemmas=[]
    for token in word_tokenize(without_punct):
        lemmas.append(lemmatizer.lemmatize(token.lower()))
    return ' '.join(lemmas)

review_data['cleaned_review_text']=review_data['reviewText'].apply(_clean_review)

#print the new cleaned data
review_data[['cleaned_review_text','reviewText','overall']].head()


Unnamed: 0,cleaned_review_text,reviewText,overall
0,not much to write about here but it doe exactl...,"Not much to write about here, but it does exac...",5
1,the product doe exactly a it should and is qui...,The product does exactly as it should and is q...,5
2,the primary job of this device is to block the...,The primary job of this device is to block the...,5
3,nice windscreen protects my mxl mic and preven...,Nice windscreen protects my MXL mic and preven...,5
4,this pop filter is great it look and performs ...,This pop filter is great. It looks and perform...,5


**create the TF-IDF frequency**

In [None]:
#create the TF-IDF frequency
# Keep only the 500 highest-ranked terms as features.
tfidf_model=TfidfVectorizer(max_features=500)
review_tfidf_matrix=tfidf_model.fit_transform(review_data['cleaned_review_text'])
# Densify so each term becomes an ordinary DataFrame column.
tfidf_df=pd.DataFrame(review_tfidf_matrix.todense())
# Label the columns with the vocabulary terms in sorted order.
tfidf_df.columns=sorted(tfidf_model.vocabulary_)
tfidf_df.head()


Unnamed: 0,10,100,12,20,able,about,accurate,acoustic,actually,adapter,add,after,again,all,almost,already,also,although,always,am,amazing,amazon,amp,an,and,another,any,anyone,anything,are,around,at,audio,away,awesome,back,bad,bag,ball,band,...,used,using,value,very,vocal,volume,wa,want,wanted,wave,way,we,week,weight,well,went,were,what,when,where,which,while,who,why,will,with,without,wont,work,worked,worth,would,wouldnt,wrong,year,yet,you,youll,your,youre
0,0.0,0.0,0.0,0.0,0.0,0.158756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235722,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134747,0.0,0.0,0.155677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198901,0.13662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.111728,0.0,0.0,0.0,0.0,0.0,0.184319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.091629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.097955,0.066964,0.0,0.0,0.09129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.073158,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.17473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165882,0.0,0.149601,0.0,0.0,0.0,0.149503,0.0,0.0,0.0,0.0,0.0,0.115105,0.0,0.0,0.0,0.0,0.081666,0.0,0.111465,0.0
3,0.0,0.0,0.0,0.0,0.338287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.184313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.420188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.215812,0.0,0.0,0.0,0.0,0.0,0.212445,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.158665,0.0,0.0,0.320765


In [None]:
#set the target data
def _to_binary_target(score):
    """Map an `overall` score to 0 when it is <= 4, otherwise to 1."""
    if score<=4:
        return 0
    return 1

review_data['target'] = review_data['overall'].apply(_to_binary_target)
# Class balance of the derived binary target.
review_data['target'].value_counts()


1    6938
0    3323
Name: target, dtype: int64

**implement Logistic Regression**

In [None]:
#implement Logistic Regression
from sklearn.linear_model import LogisticRegression
# Default-parameter logistic regression: TF-IDF features -> 0/1 target.
logreg=LogisticRegression()
logreg.fit(tfidf_df,review_data['target'])
# Labels are predicted for the same rows the model was fitted on (no hold-out split in this cell).
predicted_labels=logreg.predict(tfidf_df)
# Cell output: second column of the predict_proba matrix for every row.
logreg.predict_proba(tfidf_df)[:,1]




array([0.58721164, 0.62672131, 0.53073566, ..., 0.66908551, 0.55763822,
       0.24135068])

**compare the result of classification using crosstab**

In [None]:
#compare the result of classification using crosstab
# `predicted_labels` is the global set by the preceding model cell; the same name is reused by the NaiveBayes cell later.
review_data['predicted_labels']=predicted_labels
# Rows: actual target; columns: predicted label.
pd.crosstab(review_data['target'],review_data['predicted_labels'])

predicted_labels,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1507,1816
1,620,6318


**implement NaiveBayes**

In [None]:
#implement NaiveBayes
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default parameters, fitted on the dense TF-IDF frame.
nb=GaussianNB()
nb.fit(tfidf_df,review_data['target'])
# Overwrites the `predicted_labels` global from the logistic-regression cell; predictions are for the training rows.
predicted_labels=nb.predict(tfidf_df)
# Cell output: second column of the predict_proba matrix for every row.
nb.predict_proba(tfidf_df)[:,1]


array([9.99003326e-01, 1.05421334e-08, 1.10486128e-07, ...,
       1.00774767e-01, 9.78390087e-08, 7.69330206e-23])

**compare the result of classification using crosstab**

In [None]:
#compare the result of classification using crosstab
# `predicted_labels` here is whatever the most recently run model cell assigned (the NaiveBayes cell when run top to bottom).
review_data['predicted_labels_nb']=predicted_labels
# Rows: actual target; columns: NaiveBayes prediction.
pd.crosstab(review_data['target'],review_data['predicted_labels_nb'])


predicted_labels_nb,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2314,1009
1,2393,4545


**implement KNN and compare result**

In [None]:
#implement KNN and compare result
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbours classifier on the TF-IDF features.
knn=KNeighborsClassifier(n_neighbors=3)
knn.fit(tfidf_df,review_data['target'])
# Predictions are made on the rows the model was fitted on.
review_data['predicted_labels_knn']=knn.predict(tfidf_df)
# Rows: actual target; columns: KNN prediction.
pd.crosstab(review_data['target'],review_data['predicted_labels_knn'])


predicted_labels_knn,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2637,686
1,273,6665


**implement Linear Regression and compare result**

In [None]:
#implement Linear Regression and compare result
from sklearn.linear_model import LinearRegression
# Regress the raw `overall` score (not the binary target) on the TF-IDF features.
linreg=LinearRegression()
linreg.fit(tfidf_df,review_data['overall'])
# Cell output: the fitted coefficients, one per TF-IDF column.
linreg.coef_


array([ 1.22649447e-01,  4.40931711e-01,  3.23271023e-01, -2.33162401e-01,
        3.61114510e-01,  4.34718878e-01,  5.50682826e-01,  2.99511704e-01,
        4.62252743e-01,  5.93508646e-01,  7.20542839e-01, -3.42577396e-01,
       -2.52994615e-01,  3.81456696e-01, -6.65937244e-01, -9.47907152e-02,
        1.40799851e-01, -1.24491775e-01,  4.88601337e-01,  1.77830437e-01,
        8.10385480e-01, -4.08209372e-01,  1.11693628e-01, -3.79111146e-02,
        4.87290716e-01, -3.25573590e-01,  2.98768816e-01,  1.02962600e-01,
       -3.04871209e-01, -1.11659312e-01, -3.99392894e-02, -1.56503462e-01,
        2.53272469e-01, -7.51188824e-02,  1.01363158e+00, -1.18361969e+00,
       -1.08977857e+00,  1.79173769e-01, -5.24901116e-01,  2.45126478e-01,
        1.22103655e-01, -6.79357748e-02, -1.40192387e-03,  1.11696462e+00,
        6.23379499e-03,  1.79762994e-01, -7.09365348e-02,  7.86166962e-02,
        1.35545654e-01,  1.16308438e+00, -4.05823638e-01,  6.33110938e-02,
        5.06443704e-02,  

**find the intercept**

In [None]:
#find the intercept of the fitted linear regression
linreg.intercept_


4.2321479002381475

**prediction**

In [None]:
#prediction
# Predicted `overall` scores for the rows the model was fitted on.
linreg.predict(tfidf_df)

array([4.23592256, 4.19417748, 4.16506544, ..., 4.4559338 , 4.43368875,
       4.07244581])

In [None]:
#predict overall score
# Store the linear-regression predictions next to the actual scores.
review_data['predicted_score_from_linear_regression']=linreg.predict(tfidf_df)
# Show the first 20 actual vs. predicted scores.
review_data[['overall','predicted_score_from_linear_regression']].head(20)


Unnamed: 0,overall,predicted_score_from_linear_regression
0,5,4.235923
1,5,4.194177
2,5,4.165065
3,5,4.099515
4,5,4.824218
5,5,4.97074
6,5,4.488011
7,3,3.850576
8,5,4.966805
9,5,4.246506


**implement decision tree classifier**

In [None]:
#implement decision tree classifier
from sklearn import tree
# Decision tree with default parameters (no depth limit is set here).
dtc=tree.DecisionTreeClassifier()
dtc=dtc.fit(tfidf_df,review_data['target'])
# Predictions are made on the rows the tree was fitted on.
review_data['predicted_labels_dtc']=dtc.predict(tfidf_df)

# Rows: actual target; columns: decision-tree prediction.
pd.crosstab(review_data['target'],review_data['predicted_labels_dtc'])



predicted_labels_dtc,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3322,1
1,0,6938


**implement decision tree regression**

In [None]:
#implement decision tree regression
from sklearn import tree
# Regression tree with default parameters, fitted against the raw `overall` score.
dtr=tree.DecisionTreeRegressor()
dtr=dtr.fit(tfidf_df,review_data['overall'])
# Predictions are made on the rows the tree was fitted on.
review_data['predicted_values_dtr']=dtr.predict(tfidf_df)
# First 10 predicted vs. actual scores.
review_data[['predicted_values_dtr','overall']].head(10)


Unnamed: 0,predicted_values_dtr,overall
0,5.0,5
1,5.0,5
2,5.0,5
3,5.0,5
4,5.0,5
5,5.0,5
6,5.0,5
7,3.0,3
8,5.0,5
9,5.0,5


**generic function for classifier models**

In [None]:
#generic function for classifier models
def clf_model(model_type,X_train,y):
  """Fit a classifier and return its predicted labels for ``X_train``.

  Parameters
  ----------
  model_type : estimator instance
      Object exposing ``fit(X, y)`` (returning the fitted estimator) and
      ``predict(X)``.
  X_train : array-like or DataFrame
      Feature matrix used both for fitting and for prediction.
  y : array-like
      Target labels aligned with the rows of ``X_train``.

  Returns
  -------
  The value returned by ``predict`` for ``X_train``.
  """
  model=model_type.fit(X_train,y)
  # Predict on the features passed in, not on a notebook-level global, so the
  # helper keeps working after `tfidf_df` is rebound later in the notebook.
  predicted_labels=model.predict(X_train)
  return predicted_labels


**implement random forest classifier**

In [None]:
#implement random forest classifier
import sklearn
from sklearn.ensemble import RandomForestClassifier
# 20 shallow trees (depth <= 4), sqrt(n_features) candidates per split, fixed seed.
rfc=RandomForestClassifier(n_estimators=20,max_depth=4,max_features='sqrt',random_state=1)
# clf_model is the generic fit-and-predict helper defined in the cell above.
review_data['predicted_labels_rfc']=clf_model(rfc,tfidf_df,review_data['target'])
# Rows: actual target; columns: random-forest prediction.
pd.crosstab(review_data['target'],review_data['predicted_labels_rfc'])


**implement GBM classifier**

In [None]:
#implement GBM classifier
from sklearn.ensemble import GradientBoostingClassifier
# Only 2 boosting stages of depth-3 trees, with a fixed seed.
gbc=GradientBoostingClassifier(n_estimators=2,max_depth=3,max_features='sqrt',random_state=1)
# Fit and predict through the generic clf_model helper.
review_data['predicted_labels_gbc']=clf_model(gbc,tfidf_df,review_data['target'])
# Rows: actual target; columns: gradient-boosting prediction.
pd.crosstab(review_data['target'],review_data['predicted_labels_gbc'])


**implement XGBoost classifier**

In [None]:
#implement XGBoost classifier
from xgboost import XGBClassifier
# 20 rounds, learning rate 0.03, depth 5, 60% row and column subsampling, reg_alpha=10, fixed seed.
xgb_clf=XGBClassifier(n_estimators=20,learning_rate=0.03,max_depth=5,subsample=0.6,colsample_bytree=0.6,reg_alpha=10,seed=42)
# Fit and predict through the generic clf_model helper.
review_data['predicted_labels_xgbc']=clf_model(xgb_clf,tfidf_df,review_data['target'])
# Rows: actual target; columns: XGBoost prediction.
pd.crosstab(review_data['target'],review_data['predicted_labels_xgbc'])


**generic function for regression models**

In [None]:
def reg_model(model_type,X_train,y):
  """Fit a regressor and return its predicted values for ``X_train``.

  Parameters
  ----------
  model_type : estimator instance
      Object exposing ``fit(X, y)`` (returning the fitted estimator) and
      ``predict(X)``.
  X_train : array-like or DataFrame
      Feature matrix used both for fitting and for prediction.
  y : array-like
      Continuous targets aligned with the rows of ``X_train``.

  Returns
  -------
  The value returned by ``predict`` for ``X_train``.
  """
  model=model_type.fit(X_train,y)
  # Predict on the argument rather than the notebook-level `tfidf_df`, which is
  # rebound to a different matrix in later sections.
  predicted_values=model.predict(X_train)
  return predicted_values


**implement Random Forest Regressor**

In [None]:
#implement Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
# 20 shallow trees (depth <= 4), sqrt(n_features) candidates per split, fixed seed.
rfg=RandomForestRegressor(n_estimators=20,max_depth=4,max_features='sqrt',random_state=1)
# Fit and predict through the generic reg_model helper.
review_data['predicted_values_rfg']=reg_model(rfg,tfidf_df,review_data['overall'])
# First 10 actual vs. predicted scores.
review_data[['overall','predicted_values_rfg']].head(10)


**implement GradientBoostingRegressor**

In [None]:
#implement GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingRegressor
# 20 boosting stages of depth-4 trees, with a fixed seed.
gbr=GradientBoostingRegressor(n_estimators=20,max_depth=4,max_features='sqrt',random_state=1)
# Fit and predict through the generic reg_model helper.
review_data['predicted_values_gbr']=reg_model(gbr,tfidf_df,review_data['overall'])
# Show the gradient-boosting column created above (the cell previously displayed the random-forest column).
review_data[['overall','predicted_values_gbr']].head(10)


**implement XGBRegressor**

In [None]:
#implement XGBRegressor
from xgboost import XGBRegressor
# Same hyper-parameters as the XGBoost classifier cell, applied to the raw `overall` score.
xgbr=XGBRegressor(n_estimators=20,learning_rate=0.03,max_depth=5,subsample=0.6,colsample_bytree=0.6,reg_alpha=10,seed=42)
# Fit and predict through the generic reg_model helper.
review_data['predicted_values_xgbr']=reg_model(xgbr,tfidf_df,review_data['overall'])
# First 10 actual vs. predicted scores.
review_data[['overall','predicted_values_xgbr']].head(10)


**sampling**

In [None]:
#sampling
import pandas as pd
# Dataset for the sampling examples; the path is relative to the notebook's working directory.
data=pd.read_csv('cancerdata.csv')
data.head()


**simple sampling**

In [None]:
#simple sampling
# random_state fixes the draw so the same rows are selected on every run.
data_sample_random=data.sample(frac=0.1,random_state=42) #selecting 10% sample randomly
data_sample_random.head()


**stratified sampling**

In [None]:
#stratified sampling
from sklearn.model_selection import train_test_split
# 80/20 split stratified on `diagnosis`. Note the whole frame (including the
# `diagnosis` column) is passed as the first array, so X_train/X_test still contain the label.
X_train,X_test,Y_train,Y_test=train_test_split(data,data['diagnosis'],test_size=0.2,random_state=42,stratify=data['diagnosis'])
data.shape  #shape of the entire data



In [None]:
# NOTE: this is not the last expression of the cell, so the notebook does not display it.
X_train.shape    #shape of the training split
#Multi-stage sampling (based on diagnosis find area_mean)
# Stage 1: keep only rows whose diagnosis is 'M'.
data_ms=data[data['diagnosis'].isin(['M'])]
# Stage 2: of those, keep rows with area_mean >= 500.
data_ms_q2=data_ms[data_ms['area_mean']>=500]
# Stage 3: draw a reproducible 2% random sample from what is left.
data_ms_sample=data_ms_q2.sample(frac=0.02,random_state=42)
data_ms_sample.head()


**unsupervised learning**

**implement hierarchical clustering**

In [None]:

#unsupervised learning
#implement hierarchical clustering
from sklearn.datasets import fetch_20newsgroups
import scipy
from scipy.cluster.hierarchy import dendrogram
import matplotlib as mpl
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from pylab import *
import warnings
warnings.filterwarnings('ignore')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')



**remove the stopwords**

In [None]:
#remove the stopwords
# NLTK's English stop-word list ...
stop_words=stopwords.words('english')
# ... extended with every single character of string.printable, so one-character tokens are dropped too.
stop_words=stop_words+list(string.printable)
lemmatizer=WordNetLemmatizer()


**specify the categories of the article**

In [None]:
#specify the categories of the article
# The three newsgroups passed to fetch_20newsgroups in the next cell.
categories=['misc.forsale','sci.electronics','talk.religion.misc']


**fetch the dataset and view first five articles**

In [None]:
#fetch the dataset and view first five articles
# Training subset restricted to `categories`, shuffled with a fixed seed; downloads the corpus if it is not present locally.
news_data=fetch_20newsgroups(subset='train',categories=categories,shuffle=True,random_state=42,download_if_missing=True)
news_data['data'][:5]


**check the categories and their names**

In [None]:
#check the categories and their names
# Integer category code of each article.
news_data.target


In [None]:
# Category names corresponding to the integer codes in news_data.target.
news_data.target_names

**News data and corresponding categories in a Pandas dataframe**

In [None]:
#News data and corresponding categories in a Pandas dataframe
# One row per article: raw text plus its integer category code.
news_data_df=pd.DataFrame({'text':news_data['data'],'category':news_data.target})
news_data_df.head()


**Count the number of occurrences of each category**

In [None]:
#Count the number of occurrences of each category
# Number of articles per integer category code.
news_data_df['category'].value_counts()


**cleaning the text such as tokenization, lemmatization etc**

In [None]:
#cleaning the text such as tokenization, lemmatization etc
# Strip punctuation/underscores, tokenize, drop stop words, lowercase and lemmatize.
# Tokens are joined with a single space: joining on '' would fuse every document
# into one giant "word", leaving the TF-IDF vectorizer nothing meaningful to split.
news_data_df['cleaned_text']=news_data_df['text'].apply(lambda x :' '.join([lemmatizer.lemmatize(word.lower()) for word in word_tokenize(re.sub(r'([^\s\w]|_)+','',str(x))) if word.lower() not in stop_words]))


**create a TF-IDF matrix**

In [None]:
#create a TF-IDF matrix
# Keep only the 200 highest-ranked terms as features.
tfidf_model=TfidfVectorizer(max_features=200)
news_tfidf_matrix=tfidf_model.fit_transform(news_data_df['cleaned_text'])
# Densify so each term becomes an ordinary DataFrame column.
tfidf_df=pd.DataFrame(news_tfidf_matrix.todense())
# Label the columns with the vocabulary terms in sorted order.
tfidf_df.columns=sorted(tfidf_model.vocabulary_)
tfidf_df.head()


**create a distance matrix by subtracting the cosine similarities of the tf-idf representation of documents from number 1. The ward function is used to create a linkage matrix used in hierarchical clustering.**

In [None]:
#create a distance matrix by subtracting the cosine similarities of the tf-idf representation of documents from number 1. The ward function is used to create a linkage matrix used in hierarchical clustering.
# `ward` must be imported here: the imports cell only brings in `dendrogram` and `fcluster`
# from scipy.cluster.hierarchy, so calling it without this import raises NameError.
from scipy.cluster.hierarchy import ward
# Pairwise cosine distance between documents (0 = identical direction).
dist=1-cosine_similarity(tfidf_df)
# NOTE(review): `dist` is a square 2-D array; scipy documents that a 2-D input to `ward`
# is treated as observation vectors rather than a condensed distance matrix — confirm this is intended.
linkage_matrix=ward(dist)


**truncate the dendrogram to keep the last four clusters. leaf_rotation denotes the angle by which leaf labels must be rotated. leaf_font_size denotes the font size of leaf labels**

In [None]:
#truncate the dendrogram to keep the last four clusters. leaf_rotation denotes the angle by which leaf labels must be rotated. leaf_font_size denotes the font size of leaf labels
plt.title('hierarchical clustering')
plt.xlabel('clustered document')
plt.ylabel('distance')
#showing only last p clusters after merging, p is the number of cluster that should remain after merging
dendrogram(linkage_matrix,truncate_mode='lastp',p=4,leaf_rotation=90,leaf_font_size=12)
plt.show()



**fcluster() function to obtain cluster labels of clusters obtained by hierarchical clustering**

In [None]:
#fcluster() function to obtain cluster labels of clusters obtained by hierarchical clustering
# Maximum number of flat clusters to form.
k=4
# criterion='maxclust' cuts the linkage so that no more than k flat clusters result.
clusters=fcluster(linkage_matrix,k,criterion='maxclust')
clusters


**crosstab to compare the clusters of actual and expected categories**

In [None]:
#crosstab to compare the clusters of actual and expected categories
# Attach the hierarchical cluster id of each article.
news_data_df['obtained_clusters']=clusters
# Rows: true newsgroup; columns: obtained cluster. Labels now spell 'talk.religion.misc'
# correctly and use one consistent 'cluster_N' naming scheme.
pd.crosstab(news_data_df['category'].replace({0:'misc.forsale',1:'sci.electronics',2:'talk.religion.misc'}),news_data_df['obtained_clusters'].replace({1:'cluster_1',2:'cluster_2',3:'cluster_3',4:'cluster_4'}))



**implement K-means Clustering**

In [None]:
#implement K-means Clustering
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
import scipy
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from pylab import *
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
sns.set()
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**remove the stopwords**

In [None]:
#remove the stopwords
# Same construction as in the hierarchical-clustering section: NLTK English stop words ...
stop_words=stopwords.words('english')
# ... plus every single character of string.printable.
stop_words=stop_words+list(string.printable)
lemmatizer=WordNetLemmatizer()



**specify the categories of the article**

In [None]:
#specify the categories of the article
# The three newsgroups passed to fetch_20newsgroups in the next cell.
categories=['misc.forsale','sci.electronics','talk.religion.misc']



**fetch the dataset and view first five articles**

In [None]:
#fetch the dataset and view first five articles
# Training subset restricted to `categories`, shuffled with a fixed seed; downloads the corpus if it is not present locally.
news_data=fetch_20newsgroups(subset='train',categories=categories,shuffle=True,random_state=42,download_if_missing=True)
news_data['data'][:5]


**check the categories and its name**


In [None]:
#check the categories and their names
# Integer category code of each article.
news_data.target


In [None]:
# Category names corresponding to the integer codes in news_data.target.
news_data.target_names

**News data and corresponding categories in a Pandas dataframe**

In [None]:
#News data and corresponding categories in a Pandas dataframe
# One row per article: raw text plus its integer category code.
news_data_df=pd.DataFrame({'text':news_data['data'],'category':news_data.target})
news_data_df.head()



**Count the number of occurrences of each category**

In [None]:
#Count the number of occurrences of each category
# Number of articles per integer category code.
news_data_df['category'].value_counts()


**cleaning the text such as tokenization, lemmatization etc**

In [None]:
#cleaning the text such as tokenization, lemmatization etc
# Strip punctuation/underscores, tokenize, drop stop words, lowercase and lemmatize.
# Tokens are joined with a single space: joining on '' would fuse every document
# into one giant "word", leaving the TF-IDF vectorizer nothing meaningful to split.
news_data_df['cleaned_text']=news_data_df['text'].apply(lambda x :' '.join([lemmatizer.lemmatize(word.lower()) for word in word_tokenize(re.sub(r'([^\s\w]|_)+','',str(x))) if word.lower() not in stop_words]))



**create a TF-IDF matrix**

In [None]:
#create a TF-IDF matrix
# Keep only the 200 highest-ranked terms as features.
tfidf_model=TfidfVectorizer(max_features=200)
news_tfidf_matrix=tfidf_model.fit_transform(news_data_df['cleaned_text'])
# Densify so each term becomes an ordinary DataFrame column.
tfidf_df=pd.DataFrame(news_tfidf_matrix.todense())
# Label the columns with the vocabulary terms in sorted order.
tfidf_df.columns=sorted(tfidf_model.vocabulary_)
tfidf_df.head()


**k-means cluster**

In [None]:
#k-means cluster
# Fix random_state so centroid initialisation, and therefore the cluster ids, are reproducible across runs.
kmeans=KMeans(n_clusters=4,random_state=42)
kmeans.fit(tfidf_df)
# Cluster id assigned to each article.
y_kmeans=kmeans.predict(tfidf_df)
news_data_df['obtained_clusters']=y_kmeans


**crosstab to compare the clusters of actual and expected categories**

In [None]:
#crosstab to compare the clusters of actual and expected categories
# Use the k-means assignments (`y_kmeans`), not the `clusters` array left over from the
# hierarchical-clustering section, which would silently replace the k-means result.
news_data_df['obtained_clusters']=y_kmeans
# KMeans labels start at 0, so the mapping covers 0..3; the category label typo is fixed too.
pd.crosstab(news_data_df['category'].replace({0:'misc.forsale',1:'sci.electronics',2:'talk.religion.misc'}),news_data_df['obtained_clusters'].replace({0:'cluster_0',1:'cluster_1',2:'cluster_2',3:'cluster_3'}))


**obtain the optimal value of k by trying k from 1 to 5. For each value of k, we store the distortion, that is, the mean distance of the documents from their nearest cluster center. The optimal k is the value where the slope of the plot changes rapidly (the "elbow").**

In [None]:
#obtain the optimal value of k by trying k = 1 to 5. For each k we store the distortion, i.e. the mean distance of the documents from their nearest cluster center. The optimal k is where the slope of the plot changes sharply (the "elbow").
distortions=[]
K=range(1,6)
for k in K:
  # Fixed seed so the curve is reproducible across runs.
  kmeanModel=KMeans(n_clusters=k,random_state=42)
  kmeanModel.fit(tfidf_df)
  # Distance from every document to its closest centroid, averaged over all documents.
  nearest_center_dist=np.min(cdist(tfidf_df,kmeanModel.cluster_centers_,'euclidean'),axis=1)
  distortions.append(np.mean(nearest_center_dist))
plt.plot(K,distortions,'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('Elbow Method for optimal clusters')
plt.show()
