In [None]:
# **DataCamp Mock Test: Coffee Shops**

The following is a test from DataCamp as part of the Data Science Certification. 

Before starting the test I will load all required data sets provided by DataCamp. Afterwards I will perform a series of tasks. 

*The tasks are as follows:* 

1. Cleaning the data. I will work on each of the columns to create the expected dataset, either by replacing missing values or by encoding information into the expected format. The cleaned DataFrame is named `clean_data`.
1. Producing a table to show the difference in the median number of reviews by rating
1. Fitting a baseline model using `train.csv`
    1. Using 'validation.csv' to predict new values for `reviews` column
1. Fitting a comparison model to predict the number of store reviews using `train.csv`
    1. Create `compare_result` dataframe with `reviews` column 

In [None]:
# import all needed libraries
import pandas as pd
import numpy as np

# Load the three CSV files saved from DataCamp.
# NOTE: DATA_DIR is an absolute, machine-specific location; adjust it when
# running the notebook on another machine.
DATA_DIR = '/Users/karolk/Python_Work/DataCamp/Datasets/Coffeeshops'
coffee, train, validation = (
    pd.read_csv(f'{DATA_DIR}/{dataset}.csv')
    for dataset in ('coffee', 'train', 'validation')
)



In [None]:
# Task 1: Cleaning the data

# Work on a copy called 'clean_data' so the raw 'coffee' dataframe stays untouched
clean_data = coffee.copy()

# replace missing values in the 'Rating' column with 0
clean_data['Rating'] = clean_data['Rating'].fillna(0)

# replace missing values in the 'Reviews' column with the overall median of the column
clean_data['Reviews'] = clean_data['Reviews'].fillna(clean_data['Reviews'].median())

# replace missing values in 'Dine in option' and 'Takeout option' with False.
# The columns are converted to the nullable 'boolean' dtype before filling and to
# plain bool afterwards: calling fillna(False) directly on an object-dtype column
# relies on pandas' deprecated silent downcasting (FutureWarning in pandas 2.2).
for option_column in ['Dine in option', 'Takeout option']:
    clean_data[option_column] = (
        clean_data[option_column]
        .astype('boolean')
        .fillna(False)
        .astype(bool)
    )

# rename 'Delivery option' to 'Delivery Option', 'Dine in option' to 'Dine in Option'
# and 'Takeout option' to 'Takeout Option'
clean_data = clean_data.rename(columns={
    'Delivery option': 'Delivery Option',
    'Dine in option': 'Dine in Option',
    'Takeout option': 'Takeout Option',
})

clean_data



In [None]:
# Task 2: Producing a table to show the difference in the median number of reviews depending on the 'Rating' column

# Build 'reviews_by_rating' in one chain:
#   * group the raw 'coffee' data by 'Rating'
#   * named aggregation yields the final column names directly
#     ('Med_Reviews', 'Min_Reviews', 'Max_Reviews'), so no MultiIndex flattening
#     or renaming step is needed
#   * reset_index() turns 'Rating' back into a regular column
#   * only the median column is rounded, to 1 decimal place
reviews_by_rating = (
    coffee
    .groupby('Rating')
    .agg(
        Med_Reviews=('Reviews', 'median'),
        Min_Reviews=('Reviews', 'min'),
        Max_Reviews=('Reviews', 'max'),
    )
    .reset_index()
    .round({'Med_Reviews': 1})
)

# view the new dataframe
reviews_by_rating

In [None]:
# Task 3: Fitting a baseline using `train.csv`. The value we will predict is the number of reviews.
# As a baseline I will use a linear regression model using only the 'Rating' column.

# import libraries needed for the model
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

# Select 'Rating' with a list so the feature matrices are 2-D, as scikit-learn expects.
# There is no test target: 'validation' is only used to produce predictions.
X_train_1 = train[['Rating']]
X_test_1 = validation[['Rating']]

# impute missing ratings with the median learned from the training data only,
# then apply that same median to the validation data
imputer = SimpleImputer(strategy='median')
X_train_1 = imputer.fit_transform(X_train_1)
X_test_1 = imputer.transform(X_test_1)

# fill missing values of the target with the training median of 'Reviews'.
# This is done in pandas so that 'imputer' stays fitted on 'Rating' and the target
# remains 1-D, which makes model.predict return a 1-D array of predictions.
y_train_1 = train['Reviews'].fillna(train['Reviews'].median())

# create and fit the model
model = LinearRegression()
model.fit(X_train_1, y_train_1)

# predict the number of reviews for the validation data
y_pred = model.predict(X_test_1)

# create 'base_result' from the 'validation' dataframe: place name (renamed from
# 'Place.name' to 'Place name'), rating and the predicted number of reviews
# rounded to 2 decimal places
base_result = (
    validation[['Place.name', 'Rating']]
    .rename(columns={'Place.name': 'Place name'})
    .assign(Reviews=y_pred.round(2))
)

# view the new dataframe
base_result


In [None]:
# Task 4: Fit a comparison model to predict the number of store reviews using `train.csv`.
# The value we will predict is the number of reviews. As a comparison model I will use a
# linear regression model using 'Rating', 'Place.type', 'Price' and 'Delivery.option'.

# import libraries needed for the model (imported here so the cell does not depend on
# objects created while fitting the baseline model)
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

FEATURE_COLUMNS = ['Rating', 'Place.type', 'Price', 'Delivery.option']
NUMERIC_COLUMNS = ['Rating', 'Price']
PRICE_LEVELS = {'$': 1, '$$': 2, '$$$': 3}


def encode_features(raw, place_types):
    """Turn the raw feature columns of `raw` into an all-numeric feature frame.

    Parameters
    ----------
    raw : pandas.DataFrame
        Frame containing the columns listed in FEATURE_COLUMNS.
    place_types : list
        'Place.type' categories to one-hot encode. Passing the categories seen in
        the training data guarantees identical columns for train and validation;
        a place type outside this list (or a missing one) gets all-zero indicators.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with 'Rating', 'Price' mapped to
        1/2/3, 'Delivery.option' as 0/1 and one 'Place.type_<category>' indicator
        column per entry of `place_types`. Missing 'Rating' values and missing or
        unmapped 'Price' values are left as NaN for the caller to impute.
    """
    # .copy() so the assignments below never write into a slice of the source frame
    features = raw[FEATURE_COLUMNS].copy()

    # '$' -> 1, '$$' -> 2, '$$$' -> 3; anything else becomes NaN
    features['Price'] = features['Price'].map(PRICE_LEVELS)

    # missing delivery information counts as False; True/False become 1/0
    features['Delivery.option'] = (
        features['Delivery.option'].astype('boolean').fillna(False).astype(int)
    )

    # 'Place.type' is a nominal category, so it is one-hot encoded instead of being
    # label encoded (integer labels would impose an arbitrary order on the types)
    place_type = features.pop('Place.type').astype(pd.CategoricalDtype(place_types))
    indicators = pd.get_dummies(place_type, prefix='Place.type', dtype=float)

    return pd.concat([features, indicators], axis=1)


# the place types are learned from the training data only
train_place_types = sorted(train['Place.type'].dropna().unique())
X_train_2 = encode_features(train, train_place_types)
X_test_2 = encode_features(validation, train_place_types)

# impute missing 'Rating' and 'Price' values with the medians of the training data
feature_imputer = SimpleImputer(strategy='median')
X_train_2[NUMERIC_COLUMNS] = feature_imputer.fit_transform(X_train_2[NUMERIC_COLUMNS])
X_test_2[NUMERIC_COLUMNS] = feature_imputer.transform(X_test_2[NUMERIC_COLUMNS])

# fill missing values of the target with the training median of 'Reviews'; the target
# stays 1-D so that model_2.predict returns a 1-D array of predictions
y_train_2 = train['Reviews'].fillna(train['Reviews'].median())

# create and fit the model. I will use a linear regression model again
model_2 = LinearRegression()
model_2.fit(X_train_2, y_train_2)

# predict the number of reviews for the validation data
y_pred_2 = model_2.predict(X_test_2)

# create 'compare_result' from the 'validation' dataframe: place name (renamed from
# 'Place.name' to 'Place name'), rating and the predicted number of reviews
# rounded to 2 decimal places
compare_result = (
    validation[['Place.name', 'Rating']]
    .rename(columns={'Place.name': 'Place name'})
    .assign(Reviews=y_pred_2.round(2))
)

# view the new dataframe
compare_result