# Machine Learning

# 1.Building Machine Learning Classifiers: Building a basic Random Forest model

# Read and Clean text

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

stopwords=nltk.corpus.stopwords.words('english')
ps=nltk.PorterStemmer()
data=pd.read_csv('SMSSpamCollection.tsv',sep='\t')
data.columns=['label','Body_text']
#function for
#1.body length
# 2.body punctuation
def count_punct(text):
    count=sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")),3)*100
data['Body_len']=data['Body_text'].apply(lambda x:len(x)-x.count(" "))
data["punct%"]=data["Body_text"].apply(lambda x:count_punct(x))
#cleaning the text function
def clean_text(text):
    text=="".join([word.lower() for word in text if word  not in string.punctuation])
    tokens=re.split('\W+',text)
    text=[ps.stem(word) for word in tokens if word not in stopwords]
    return text
#vectorizing
tfidf_vect=TfidfVectorizer(analyzer=clean_text)
X_tfidf=tfidf_vect.fit_transform(data['Body_text'])

X_feature=pd.concat([data['Body_len'],data["punct%"],pd.DataFrame(X_tfidf.toarray())],axis=1)
X_feature.head()

Unnamed: 0,Body_len,punct%,0,1,2,3,4,5,6,7,...,7522,7523,7524,7525,7526,7527,7528,7529,7530,7531
0,24,25.0,0.101347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,39,15.4,0.079857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,116,6.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Exploring Random Forest classifier Attributes and  HyperParameters

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
print(dir(RandomForestClassifier))
print(RandomForestClassifier())

['__abstractmethods__', '__annotations__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_n_features', '_estimator_type', '_get_param_names', '_get_tags', '_make_estimator', '_more_tags', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_required_parameters', '_set_oob_score', '_validate_X_predict', '_validate_data', '_validate_estimator', '_validate_y_class_weight', 'apply', 'decision_path', 'feature_importances_', 'fit', 'get_params', 'predict', 'predict_log_proba', 'predict_proba', 'score', 'set_params']
RandomForestClassifier()


In [4]:
#feature_importances_:- this will out puts the value of each feature to the Model
#fit:- helps to fit the model and stores the object
#predict:- this model is used to prediction by using the objects created by the fit model



# Exploring random forest classifier through Cross-validation

In [5]:
from sklearn.model_selection import KFold, cross_val_score
#kfold helps in splliting the datasets
#cross_val_score will help us to get the actuall scoring 

In [6]:
rf=RandomForestClassifier(n_jobs=-1)
#n_jobs to -1 will help us to run faster by bulding the decision tree in parallel 
k_fold=KFold(n_splits=5)
#n_splits will help in splitting the dataset into the number we assign to it.
cross_val_score(rf,X_feature,data['label'],cv=k_fold,scoring='accuracy',n_jobs=-1)

array([0.9793722 , 0.98384201, 0.98114901, 0.97396768, 0.98204668])

# 2.Building Machine Learning Classifier:- Random Forest on a hold out test set

# Exploring RandomForestClassifier through  HoldOut set

In [7]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [8]:
#creating 4 datasets
x_train,x_test,y_train,y_test=train_test_split(X_feature,data['label'],test_size=0.2)
#0.2=20% is the test size

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf1=RandomForestClassifier(n_estimators=50,max_depth=20,n_jobs=-1)
#max_depth will give the depth of the decision tree to 20 default value is None if u use default value it will build the decision tree as deep it can build.
rf_model=rf1.fit(x_train,y_train)
#i have explained the fit function above in cell number 12


In [10]:
sorted(zip(rf_model.feature_importances_,x_train.columns),reverse=True)[0:10]
#sort is used to sort them in order and reverse will make the default ascending order to descending order

[(0.058253796146168796, 1898),
 (0.04948181796118803, 6906),
 (0.04380202267316266, 'Body_len'),
 (0.023914451157795155, 4574),
 (0.01971188002947149, 7414),
 (0.017719719415993592, 3130),
 (0.017579632489775118, 1039),
 (0.015885625174227522, 0),
 (0.015825236745703086, 5412),
 (0.015498195226661484, 6623)]

In [11]:
y_pred=rf_model.predict(x_test)
precision,recall,fscore,support=score(y_test,y_pred,pos_label='spam',average='binary')

In [12]:
print('precision: {} / Recall:{} / Acuracy:{}'.format(round(precision,3),
                                                     round(recall,3),
                                                     round((y_pred==y_test).sum()/len(y_pred),3)))

precision: 1.0 / Recall:0.667 / Acuracy:0.956


# The meaning of the above out put is shown below Example

![](img9.png)

# 3.Building the Machine learning cassifiers: explore random Forest model with Grid-search

# Building our own grid

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [15]:
x_train,x_test,y_train,y_test=train_test_split(X_feature,data['label'],test_size=0.2)

In [16]:
def train_Rf(n_est,depth):
    rf1=RandomForestClassifier(n_estimators=n_est,max_depth=depth,n_jobs=-1)
    rf_model=rf1.fit(x_train,y_train)
    y_pred=rf_model.predict(x_test)
    precision,recall,fscore,support=score(y_test,y_pred,pos_label='spam',average='binary')
    print('EST:{}/ Depth:{} /precision: {} / Recall:{} / Acuracy:{}'.format(n_est,depth,round(precision,3),
                                                     round(recall,3),
                                                     round((y_pred==y_test).sum()/len(y_pred),3)))

In [17]:
for n_est in[10,50,100]:
    for depth in [10,20,30,None]:
        train_Rf(n_est,depth)

EST:10/ Depth:10 /precision: 1.0 / Recall:0.307 / Acuracy:0.899
EST:10/ Depth:20 /precision: 1.0 / Recall:0.644 / Acuracy:0.948
EST:10/ Depth:30 /precision: 1.0 / Recall:0.706 / Acuracy:0.957
EST:10/ Depth:None /precision: 1.0 / Recall:0.761 / Acuracy:0.965
EST:50/ Depth:10 /precision: 1.0 / Recall:0.331 / Acuracy:0.902
EST:50/ Depth:20 /precision: 1.0 / Recall:0.626 / Acuracy:0.945
EST:50/ Depth:30 /precision: 1.0 / Recall:0.748 / Acuracy:0.963
EST:50/ Depth:None /precision: 1.0 / Recall:0.828 / Acuracy:0.975
EST:100/ Depth:10 /precision: 1.0 / Recall:0.294 / Acuracy:0.897
EST:100/ Depth:20 /precision: 1.0 / Recall:0.632 / Acuracy:0.946
EST:100/ Depth:30 /precision: 1.0 / Recall:0.761 / Acuracy:0.965
EST:100/ Depth:None /precision: 1.0 / Recall:0.81 / Acuracy:0.972


# Evaluating  Random Forest With Grid SearchCV

In [1]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import string

stopwords=nltk.corpus.stopwords.words('english')
ps=nltk.PorterStemmer()
data=pd.read_csv('SMSSpamCollection.tsv',sep='\t')
data.columns=['label','Body_text']
#function for
#1.body length
# 2.body punctuation
def count_punct(text):
    count=sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text)-text.count(" ")),3)*100
data['Body_len']=data['Body_text'].apply(lambda x:len(x)-x.count(" "))
data["punct%"]=data["Body_text"].apply(lambda x:count_punct(x))
#cleaning the text function
def clean_text(text):
    text=="".join([word.lower() for word in text if word  not in string.punctuation])
    tokens=re.split('\W+',text)
    text=[ps.stem(word) for word in tokens if word not in stopwords]
    return text
#vectorizing
tfidf_vect=TfidfVectorizer(analyzer=clean_text)
X_tfidf=tfidf_vect.fit_transform(data['Body_text'])
X_tfidf_feat=pd.concat([data['Body_len'],data["punct%"],pd.DataFrame(X_tfidf.toarray())],axis=1)
#count_vectorizer
count_vect=CountVectorizer(analyzer=clean_text)
x_count=count_vect.fit_transform(data['Body_text'])
x_count_feat=pd.concat([data['Body_len'],data["punct%"],pd.DataFrame(X_tfidf.toarray())],axis=1)
                                 
x_count_feat.head()

Unnamed: 0,Body_len,punct%,0,1,2,3,4,5,6,7,...,7522,7523,7524,7525,7526,7527,7528,7529,7530,7531
0,24,25.0,0.101347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,128,4.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,39,15.4,0.079857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,49,4.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,116,6.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Exploring the parameter setting using GridSearchCV

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV


In [3]:
#fortf-idf
rf=RandomForestClassifier()
parm={'n_estimators':[10,150,300],
      "max_depth":[30,60,90,None]}
gs=GridSearchCV(rf,parm,cv=5,n_jobs=-1)
gs_fit=gs.fit(X_tfidf_feat,data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
11,48.655709,7.855495,0.597819,0.197063,,300,"{'max_depth': None, 'n_estimators': 300}",0.979372,0.979354,0.978456,0.977558,0.979354,0.978819,0.000721,1
10,30.721205,1.732543,0.560376,0.092377,,150,"{'max_depth': None, 'n_estimators': 150}",0.978475,0.980251,0.979354,0.976661,0.978456,0.978639,0.00119,2
8,55.037477,1.018602,0.954751,0.179367,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.978475,0.980251,0.978456,0.975763,0.979354,0.97846,0.001502,3
5,49.8342,15.848334,1.129819,0.748795,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.979372,0.978456,0.979354,0.975763,0.979354,0.97846,0.001393,4
7,29.799014,0.701959,0.724876,0.15505,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.977578,0.981149,0.978456,0.974865,0.979354,0.978281,0.002077,5


In [4]:
#fortf-idf
rf=RandomForestClassifier()
parm={'n_estimators':[10,150,300],
      "max_depth":[30,60,90,None]}
gs=GridSearchCV(rf,parm,cv=5,n_jobs=-1)
gs_fit=gs.fit(x_count_feat,data['label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score',ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,54.212695,0.917309,0.860433,0.211918,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.980269,0.979354,0.978456,0.977558,0.979354,0.978998,0.00092,1
10,29.718315,0.457558,0.615692,0.107907,,150,"{'max_depth': None, 'n_estimators': 150}",0.979372,0.982047,0.978456,0.974865,0.979354,0.978819,0.002314,2
11,49.704287,8.606638,0.66859,0.27147,,300,"{'max_depth': None, 'n_estimators': 300}",0.979372,0.981149,0.978456,0.975763,0.979354,0.978819,0.00176,2
7,28.635619,0.138297,0.494832,0.074756,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.980269,0.981149,0.977558,0.97307,0.979354,0.97828,0.002864,4
4,24.138273,0.348198,0.495284,0.030901,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.977578,0.979354,0.979354,0.97307,0.978456,0.977562,0.002341,5
