In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

#### 0 - Data Introduction

In [2]:
url = "https://assets.datacamp.com/production/repositories/4024/datasets/f29456ea573c318fa53362fdf91871d0c7849bb2/googleplaystore.csv"

app_rating = pd.read_csv(url)
app_rating.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [3]:
app_rating['Rating'].fillna(0,inplace=True)

In [4]:
rating = app_rating['Rating']
first = app_rating.columns[0:3].to_list()
first.extend(app_rating.columns[-4:].to_list())

app_rating.drop(first,1,inplace=True)
app_rating.head()

Unnamed: 0,Reviews,Size,Installs,Type,Price,Content Rating
0,159,19M,"10,000+",Free,0,Everyone
1,967,14M,"500,000+",Free,0,Everyone
2,87510,8.7M,"5,000,000+",Free,0,Everyone
3,215644,25M,"50,000,000+",Free,0,Teen
4,967,2.8M,"100,000+",Free,0,Everyone


In [5]:
rating.head()

0    4.1
1    3.9
2    4.7
3    4.5
4    4.3
Name: Rating, dtype: float64

In [6]:
app_rating['Reviews'] = app_rating['Reviews'].str.rstrip('M')
app_rating['Reviews'] = app_rating['Reviews'].astype('float')

In [7]:
app_rating['Size'] = app_rating['Size'].str.rstrip('M')
app_rating['Size'] = app_rating['Size'].str.rstrip('k')
app_rating['Size'] = app_rating['Size'].str.rstrip('+')

app_rating['Size'].head()

0     19
1     14
2    8.7
3     25
4    2.8
Name: Size, dtype: object

In [8]:
app_rating['Size'].tail()

10836                    53
10837                   3.6
10838                   9.5
10839    Varies with device
10840                    19
Name: Size, dtype: object

In [9]:
app_rating['Size'].replace(['Varies with device','1,000'],[0,1.000],inplace=True)

In [10]:
app_rating['Size'].tail()

10836     53
10837    3.6
10838    9.5
10839      0
10840     19
Name: Size, dtype: object

In [11]:
app_rating['Size'] = app_rating['Size'].astype('float')

In [12]:
app_rating.head()

Unnamed: 0,Reviews,Size,Installs,Type,Price,Content Rating
0,159.0,19.0,"10,000+",Free,0,Everyone
1,967.0,14.0,"500,000+",Free,0,Everyone
2,87510.0,8.7,"5,000,000+",Free,0,Everyone
3,215644.0,25.0,"50,000,000+",Free,0,Teen
4,967.0,2.8,"100,000+",Free,0,Everyone


In [13]:
app_rating['Installs'] = app_rating['Installs'].str.strip('+')
app_rating['Installs'] = app_rating['Installs'].str.replace(',','')

In [14]:
app_rating['Installs'].head()

0       10000
1      500000
2     5000000
3    50000000
4      100000
Name: Installs, dtype: object

In [15]:
app_rating['Installs'].head()

0       10000
1      500000
2     5000000
3    50000000
4      100000
Name: Installs, dtype: object

In [16]:
app_rating['Installs'].tail()

10836        5000
10837         100
10838        1000
10839        1000
10840    10000000
Name: Installs, dtype: object

In [17]:
app_rating['Installs'].replace('Free',0,inplace=True)

In [18]:
app_rating['Installs'] = app_rating['Installs'].astype('int')

In [19]:
app_rating.head()

Unnamed: 0,Reviews,Size,Installs,Type,Price,Content Rating
0,159.0,19.0,10000,Free,0,Everyone
1,967.0,14.0,500000,Free,0,Everyone
2,87510.0,8.7,5000000,Free,0,Everyone
3,215644.0,25.0,50000000,Free,0,Teen
4,967.0,2.8,100000,Free,0,Everyone


In [20]:
app_rating['Type'].fillna('0',inplace=True)

In [21]:
app_rating['Type'].unique()

array(['Free', 'Paid', '0'], dtype=object)

In [22]:
le = LabelEncoder()

app_rating['Type'] = le.fit_transform(app_rating['Type'])

In [23]:
app_rating['Type'] = app_rating['Type'].astype('int')

In [24]:
app_rating.head()

Unnamed: 0,Reviews,Size,Installs,Type,Price,Content Rating
0,159.0,19.0,10000,1,0,Everyone
1,967.0,14.0,500000,1,0,Everyone
2,87510.0,8.7,5000000,1,0,Everyone
3,215644.0,25.0,50000000,1,0,Teen
4,967.0,2.8,100000,1,0,Everyone


In [25]:
app_rating['Content Rating'].unique()

array(['Everyone', 'Teen', 'Everyone 10+', 'Mature 17+',
       'Adults only 18+', 'Unrated', nan], dtype=object)

In [26]:
app_rating['Content Rating'].fillna('0',inplace=True)

In [27]:
app_rating['Content Rating'] = le.fit_transform(app_rating['Content Rating'])

In [28]:
app_rating['Content Rating'] = app_rating['Content Rating'].astype('int')

In [29]:
app_rating.head()

Unnamed: 0,Reviews,Size,Installs,Type,Price,Content Rating
0,159.0,19.0,10000,1,0,2
1,967.0,14.0,500000,1,0,2
2,87510.0,8.7,5000000,1,0,2
3,215644.0,25.0,50000000,1,0,5
4,967.0,2.8,100000,1,0,2


In [31]:
app_rating.dtypes

Reviews           float64
Size              float64
Installs            int32
Type                int32
Price              object
Content Rating      int32
dtype: object

In [32]:
rating.head()

0    4.1
1    3.9
2    4.7
3    4.5
4    4.3
Name: Rating, dtype: float64

In [33]:
app_rating['Price'] = app_rating['Price'].str.lstrip('$')
app_rating['Price'] = app_rating['Price'].str.replace('Everyone','0')
app_rating['Price'] = app_rating['Price'].astype('float')

In [34]:
app_rating.head()

Unnamed: 0,Reviews,Size,Installs,Type,Price,Content Rating
0,159.0,19.0,10000,1,0.0,2
1,967.0,14.0,500000,1,0.0,2
2,87510.0,8.7,5000000,1,0.0,2
3,215644.0,25.0,50000000,1,0.0,5
4,967.0,2.8,100000,1,0.0,2


In [35]:
app_rating.dtypes

Reviews           float64
Size              float64
Installs            int32
Type                int32
Price             float64
Content Rating      int32
dtype: object

In [36]:
X_train,X_test,y_train,y_test = train_test_split(app_rating, rating, test_size=0.2, random_state=42)

In [37]:
dtr = DecisionTreeRegressor(min_samples_leaf=3, min_samples_split=9, random_state=500)

dtr.fit(X_train, y_train)

DecisionTreeRegressor(min_samples_leaf=3, min_samples_split=9, random_state=500)

In [38]:
from sklearn.metrics import mean_absolute_error

y_pred = dtr.predict(X_test)
print('MAE: {:.3f}'.format(mean_absolute_error(y_test, y_pred)))

MAE: 0.617


#### 1 - Combining Multiple Models 

In [39]:
url_pok = "https://assets.datacamp.com/production/repositories/4024/datasets/2dd4cab3c792e2755e7dafe355a14bdb06973c5d/Pokemon.csv"

pokemon = pd.read_csv(url_pok)
pokemon.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [41]:
pokemon.drop(['#','Name','Type 1','Type 2'],1,inplace=True)
pokemon.head()

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,318,45,49,49,65,65,45,1,False
1,405,60,62,63,80,80,60,1,False
2,525,80,82,83,100,100,80,1,False
3,625,80,100,123,122,120,80,1,False
4,309,39,52,43,60,50,65,1,False


In [42]:
result = pokemon['Legendary']

pokemon.drop('Legendary',1,inplace=True)
pokemon.head()

Unnamed: 0,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
0,318,45,49,49,65,65,45,1
1,405,60,62,63,80,80,60,1
2,525,80,82,83,100,100,80,1
3,625,80,100,123,122,120,80,1
4,309,39,52,43,60,50,65,1


In [43]:
pokemon.isnull().sum()

Total         0
HP            0
Attack        0
Defense       0
Sp. Atk       0
Sp. Def       0
Speed         0
Generation    0
dtype: int64

In [44]:
result.isnull().sum()

0

In [45]:
X_train, X_test, y_train, y_test = train_test_split(pokemon, result, test_size=0.2, random_state=0)

In [51]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

In [54]:
dtc = DecisionTreeClassifier()
knn = KNeighborsClassifier()

dtc.fit(X_train, y_train)
knn.fit(X_train, y_train)

pred_dtc = dtc.predict(X_test)
pred_knn = knn.predict(X_test)

In [55]:
from sklearn.metrics import f1_score

In [56]:
a = f1_score(y_test, pred_dtc)
b = f1_score(y_test, pred_knn)

print(a)
print(b)

0.8125
0.7692307692307692


In [57]:
from sklearn.ensemble import VotingClassifier

model_vote = VotingClassifier(
    estimators=[('knn', knn), ('dtc', dtc)]
)

In [58]:
model_vote.fit(X_train, y_train)

VotingClassifier(estimators=[('knn', KNeighborsClassifier()),
                             ('dtc', DecisionTreeClassifier())])

In [59]:
vote_pred = model_vote.predict(X_test)

print(f1_score(y_test, vote_pred))

0.7692307692307692


In [60]:
from sklearn.metrics import classification_report

print(classification_report(y_test, vote_pred))

              precision    recall  f1-score   support

       False       0.97      0.99      0.98       145
        True       0.91      0.67      0.77        15

    accuracy                           0.96       160
   macro avg       0.94      0.83      0.87       160
weighted avg       0.96      0.96      0.96       160



In [61]:
url = "https://assets.datacamp.com/production/repositories/4024/datasets/02627e1959ac37b28bde9ec9d28400d776dbc123/character-predictions.csv"

got = pd.read_csv(url)
got.head()

Unnamed: 0,S.No,actual,pred,alive,plod,name,title,male,culture,dateOfBirth,...,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,boolDeadRelations,isPopular,popularity,isAlive
0,1,0,0,0.054,0.946,Viserys II Targaryen,,1,,,...,0.0,,0,0,,11,1,1,0.605351,0
1,2,1,0,0.387,0.613,Walder Frey,Lord of the Crossing,1,Rivermen,208.0,...,,1.0,1,1,97.0,1,1,1,0.896321,1
2,3,1,0,0.493,0.507,Addison Hill,Ser,1,,,...,,,0,1,,0,0,0,0.267559,1
3,4,0,0,0.076,0.924,Aemma Arryn,Queen,0,,82.0,...,,0.0,1,1,23.0,0,0,0,0.183946,0
4,5,1,1,0.617,0.383,Sylva Santagar,Greenstone,0,Dornish,276.0,...,,1.0,1,1,29.0,0,0,0,0.043478,1


#### 2 - Bagging

#### 3 - Boosting

#### 4 - Stacking

#### 5 - Footnote

1. <a href="https://assets.datacamp.com/production/repositories/4024/datasets/be1aeb4c05850973c671d689575b6613fd8c8553/googleplaystore_user_reviews.csv" style="text-decoration:none; color:black">App Reviews</a>
2. <a href="https://assets.datacamp.com/production/repositories/4024/datasets/f29456ea573c318fa53362fdf91871d0c7849bb2/googleplaystore.csv" style="text-decoration:none; color:black">App Ratings</a>
3. <a href="https://assets.datacamp.com/production/repositories/4024/datasets/2dd4cab3c792e2755e7dafe355a14bdb06973c5d/Pokemon.csv" style="text-decoration:none; color:black">Pokemon</a>
4. <a href="https://assets.datacamp.com/production/repositories/4024/datasets/02627e1959ac37b28bde9ec9d28400d776dbc123/character-predictions.csv" style="text-decoration:none; color:black">Game Of Thrones</a>
5. <a href="https://assets.datacamp.com/production/repositories/4024/datasets/68204a108133375b21076bdd7cb560d4bb7ce4b8/uci-secom.csv" style="text-decoration:none; color:black">SECOM</a>
6. <a href="https://assets.datacamp.com/production/repositories/4024/datasets/f3b1b3b8ee260b447b146f156b9fbc72e51f2131/tmdb_5000_movies.csv" style="text-decoration:none; color:black">TMDb</a>