In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Overfitting: the model fits the training data so closely that it also learns the noise
# Overfitting is the main problem of a single decision tree
# A solution is to train many decision trees and average (or vote over) their results
# This is known as Bootstrap Aggregation (bagging)
# Each tree is trained on a sample drawn from the training data with replacement
# The main dataset is thereby divided into (overlapping) subsets, one per tree
# Bagging can be done with different machine learning algorithms
# Here we are using a random forest

In [3]:
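# Random forest is a supervised learning method
# It can be used for both regression and classification
# It can handle categorical features (in this notebook they are label-encoded to integers before fitting)
# Feature scaling is not required

# Using RF for Classification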

In [4]:
# Location of the Iris CSV. Set the IRIS_CSV environment variable to point at your own
# copy of the file; when it is unset, the original author's local path is used.
IRIS_CSV = os.environ.get(
    'IRIS_CSV',
    'C:/Users/Palla Anuraag Sharma/Downloads/Datacamp/Datasets/Iris DataSet/Iris.csv',
)
df = pd.read_csv(IRIS_CSV)

In [5]:
# Promote the 'Id' column to the index. The guard keeps the cell re-runnable:
# after the first run 'Id' is no longer a column, and an unguarded
# set_index('Id') would raise KeyError.
if 'Id' in df.columns:
    df = df.set_index('Id')

# Preview five rows; the fixed random_state makes the preview repeatable.
df.sample(5, random_state=5)

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
83,5.8,2.7,3.9,1.2,Iris-versicolor
135,6.1,2.6,5.6,1.4,Iris-virginica
115,5.8,2.8,5.1,2.4,Iris-virginica
43,4.4,3.2,1.3,0.2,Iris-setosa
110,7.2,3.6,6.1,2.5,Iris-virginica


In [6]:
from sklearn.preprocessing import LabelEncoder

# Add an integer-coded copy of the species label; the text column is kept as-is.
df['Species_cat'] = LabelEncoder().fit_transform(df['Species'])

In [7]:
# Target: the integer-coded species.
y = df['Species_cat']

# Predictors: the four sepal/petal measurement columns.
x = df.loc[:, ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]

In [8]:
from sklearn.model_selection import train_test_split as tts

# Hold out 30% of the rows for testing; the seed fixes which rows land in each part.
x_train, x_test, y_train, y_test = tts(
    x, y,
    test_size=0.3,
    random_state=35,
)

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees. random_state is pinned so that the bootstrap samples and
# the per-split feature choices, and therefore the accuracy and feature importances
# computed from this model, are the same on every Restart & Run All.
iris = RandomForestClassifier(n_estimators=10, random_state=23)
iris = iris.fit(x_train, y_train)

In [14]:
from sklearn import metrics

# Predict the encoded species for the held-out rows and compare with the true labels.
predicted = iris.predict(x_test)
score = 100.0 * metrics.accuracy_score(y_test, predicted)

# The estimator is a RandomForestClassifier, so the printed label names it as such.
print(f'Random Forest Classification [Iris Data] Score = {score:4.1f}%\n')
print(f'Classification Report:\n {metrics.classification_report(y_test, predicted)}\n')

Decision Tree Classification [Iris Data] Score = 91.1%

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.81      0.93      0.87        14
           2       0.92      0.80      0.86        15

    accuracy                           0.91        45
   macro avg       0.91      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45




In [15]:
# Pair every training column with the importance the fitted forest assigned to it.
feature_importance = pd.DataFrame({
    'Feature': list(x_train.columns),
    'Importance': iris.feature_importances_,
})

# Show the ranking, most important first (feature_importance itself stays unsorted).
feature_importance.sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
3,PetalWidthCm,0.58799
2,PetalLengthCm,0.356398
0,SepalLengthCm,0.033648
1,SepalWidthCm,0.021964


# Regression Using Random Forest

In [16]:
# Location of the Airbnb NYC 2019 CSV. Set the AIRBNB_NYC_CSV environment variable to
# point at your own copy of the file; when it is unset, the original author's local
# path is used.
AIRBNB_NYC_CSV = os.environ.get(
    'AIRBNB_NYC_CSV',
    'C:/Users/Palla Anuraag Sharma/Downloads/Datacamp/Datasets/Air Bnb New York/AB_NYC_2019.csv',
)
df = pd.read_csv(AIRBNB_NYC_CSV)

In [17]:
# Peek at five listings; random_state pins which rows are drawn.
df.sample(n=5, random_state=32)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
3817,2299633,Ritz-Plaza - 2 bedroom / 2 fullbath,11747009,Troy,Manhattan,Theater District,40.76096,-73.98646,Entire home/apt,975,2,1,2015-10-09,0.02,1,0
36220,28805003,"Micro-Studio in Hells Kitchen, Shared Bath",198861577,Novo,Manhattan,Hell's Kitchen,40.76021,-73.99197,Private room,69,30,0,,,5,0
10340,7915596,"Large, Bright NYC Room Near Trains",2339049,Arianna & Z,Brooklyn,Bushwick,40.69919,-73.93818,Private room,75,2,226,2019-06-10,4.78,2,128
26333,20982862,Spacious NYC Duplex- 1 Bedroom & 2 Bathrooms,38250510,Ant,Queens,Elmhurst,40.72859,-73.87276,Entire home/apt,150,2,10,2019-01-01,0.46,1,89
35446,28128130,Astoria Cozy 2 Bedrooms Apartment,83342221,Edgard,Queens,Astoria,40.76317,-73.91124,Entire home/apt,200,5,1,2018-09-30,0.11,1,0


In [18]:
# Fill the gaps in reviews_per_month with the column's median.
rpm_median = df['reviews_per_month'].median()
df['reviews_per_month'] = df['reviews_per_month'].fillna(rpm_median)

In [19]:
# Discard the 'last_review' column. errors='ignore' keeps the cell re-runnable:
# without it a second run raises KeyError because the column is already gone.
df = df.drop(columns=['last_review'], errors='ignore')

In [20]:
from sklearn.preprocessing import LabelEncoder

# Encoder instance kept in `le`; here it is fitted on neighbourhood_group.
le = LabelEncoder()

# Store the resulting integer codes next to the original text column.
ng_codes = le.fit_transform(df['neighbourhood_group'])
df['ng_cat'] = ng_codes

In [21]:
# Integer-encode the room type. NOTE: this refits the existing `le` encoder, so from
# here on its fitted state describes room_type rather than neighbourhood_group.
df['rt_cat'] = le.fit(df['room_type']).transform(df['room_type'])

In [30]:
# Predictors: the integer codes for neighbourhood group and room type.
x = df.loc[:, ['ng_cat', 'rt_cat']]

# Target: the listing price column.
y = df['price']

In [31]:
from sklearn.model_selection import train_test_split as tts

# Hold out 30% of the listings for testing; the seed fixes which rows land in each part.
x_train, x_test, y_train, y_test = tts(
    x, y,
    test_size=0.3,
    random_state=45,
)

In [32]:
from sklearn.ensemble import RandomForestRegressor

# Regressor with default settings apart from a fixed seed. fit() hands back the
# fitted estimator, so construction and training are chained.
air_model = RandomForestRegressor(random_state=23).fit(x_train, y_train)

# Report the held-out score (R^2 for regressors), formatted as a percentage.
print(f'Score = {air_model.score(x_test, y_test):.1%}')

Score = 9.7%


In [33]:
from sklearn import metrics

# Predict prices for the held-out rows
pred = air_model.predict(x_test)

# Compute performance metrics: goodness of fit first, then the two error measures
mr2 = metrics.r2_score(y_test, pred)
mae = metrics.mean_absolute_error(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)

# Display metrics
print(f'R^2 Score             = {mr2:5.3f}')
print(f'Mean Absolute Error   = {mae:4.2f}')
print(f'Mean Squared Error    = {mse:4.2f}')


R^2 Score             = 0.097
Mean Absolute Error   = 72.47
Mean Squared Error    = 43116.21
