In [None]:
# Import pandas
import pandas as pd

In [None]:
# Load 'bike_rentals.csv' (path relative to the working directory) into the DataFrame df_bikes
df_bikes = pd.read_csv('bike_rentals.csv')

In [None]:
# Display the first 5 rows of df_bikes
df_bikes.head()

In [None]:
# Show descriptive statistics for df_bikes
df_bikes.describe()

In [None]:
# Display the df_bikes summary produced by DataFrame.info()
df_bikes.info()

In [None]:
# Count null values: the first sum() is per column, the second adds the columns together
df_bikes.isna().sum().sum()

In [None]:
# Show every row of df_bikes that contains at least one null value
df_bikes[df_bikes.isna().any(axis=1)]

In [None]:
# Fill windspeed null values with the column median.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# operates on an intermediate object and is not guaranteed to update df_bikes
# (pandas warns about it and Copy-on-Write makes it a no-op).
df_bikes['windspeed'] = df_bikes['windspeed'].fillna(df_bikes['windspeed'].median())

In [None]:
# Display the rows at positions 56 and 81 (.iloc is position-based)
df_bikes.iloc[[56, 81]]

In [None]:
# Group by season and take the median of each numeric column.
# numeric_only=True is required because df_bikes still holds the unparsed
# 'dteday' column at this point; without it, recent pandas versions raise a
# TypeError when asked for the median of a non-numeric column.
df_bikes.groupby(['season']).median(numeric_only=True)

In [None]:
# Replace each null 'hum' value with the median 'hum' of that row's season
df_bikes['hum'] = df_bikes['hum'].fillna(
    df_bikes.groupby('season')['hum'].transform('median')
)

In [None]:
# Show the rows whose 'temp' value is null
df_bikes[df_bikes['temp'].isna()]

In [None]:
# Average 'temp' and 'atemp' over the rows at positions 700 and 702.
# NOTE(review): presumably these are the rows on either side of the row with
# the missing temperature -- confirm against the null rows shown for 'temp'.
mean_temp = (df_bikes['temp'].iloc[700] + df_bikes['temp'].iloc[702]) / 2
mean_atemp = (df_bikes['atemp'].iloc[700] + df_bikes['atemp'].iloc[702]) / 2

# Replace null values with the mean temperatures. Every null in the column
# receives the same value. The filled column is assigned back rather than
# using fillna(..., inplace=True) on the selected column, which is not
# guaranteed to modify df_bikes (chained in-place assignment).
df_bikes['temp'] = df_bikes['temp'].fillna(mean_temp)
df_bikes['atemp'] = df_bikes['atemp'].fillna(mean_atemp)

In [None]:
# Parse the 'dteday' column with pd.to_datetime and store the result back in df_bikes
df_bikes['dteday'] = pd.to_datetime(df_bikes['dteday'])

In [None]:
# Display 'dteday' parsed with errors='coerce' (unparseable values become NaT).
# The result is only displayed, not assigned back to df_bikes.
# A single vectorized pd.to_datetime call replaces the element-wise apply, and
# the deprecated infer_datetime_format argument is no longer passed.
pd.to_datetime(df_bikes['dteday'], errors='coerce')

In [None]:
# Import datetime
import datetime as dt

In [None]:
# Set 'mnth' to the month number taken from 'dteday' via the .dt accessor
# (this is the Series accessor, not the `datetime` module)
df_bikes['mnth'] = df_bikes['dteday'].dt.month

In [None]:
# Show the last 5 rows of df_bikes
df_bikes.tail()

In [None]:
# Set column 'yr' to 1.0 for the row labelled 730 (.loc is label-based)
df_bikes.loc[730, 'yr'] = 1.0

In [None]:
# Show the last 5 rows again, after the 'yr' value of row 730 was set
df_bikes.tail()

In [None]:
# Remove the 'dteday' column from df_bikes
df_bikes = df_bikes.drop(columns='dteday')

In [None]:
# Remove the 'casual' and 'registered' columns from df_bikes
df_bikes = df_bikes.drop(columns=['casual', 'registered'])

In [None]:
# Write df_bikes to 'bike_rentals_cleaned.csv'; index=False leaves the row index out of the file
df_bikes.to_csv('bike_rentals_cleaned.csv', index=False)

In [None]:
# Features X are every column except the last; target y is the last column
X, y = df_bikes.iloc[:, :-1], df_bikes.iloc[:, -1]

In [None]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Import Linear Regression
from sklearn.linear_model import LinearRegression

# Split X and y into train and test sets; random_state=2 fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [None]:
# Silence warnings: no category or module filter is passed, so the 'ignore'
# action applies to every warning issued after this cell runs
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Imports used for scoring (later cells also rely on these names)
from sklearn.metrics import mean_squared_error
import numpy as np

# Create the LinearRegression model and fit it on the training data
lin_reg = LinearRegression().fit(X_train, y_train)

# Predict the test set with lin_reg
y_pred = lin_reg.predict(X_test)

# Mean squared error on the test set, then its square root
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Display root mean squared error
print("RMSE: %0.2f" % rmse)

In [None]:
# Display descriptive statistics of the 'cnt' column
df_bikes['cnt'].describe()

In [None]:
# Import XGBRegressor
from xgboost import XGBRegressor

# Create the XGBRegressor with default settings and fit it to the training set
xg_reg = XGBRegressor().fit(X_train, y_train)

# Predict the test set
y_pred = xg_reg.predict(X_test)

# Mean squared error on the test set, then its square root
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Display the root mean squared error
print("RMSE: %0.2f" % rmse)

In [None]:
# Import cross_val_score
from sklearn.model_selection import cross_val_score

# Linear Regression model to cross-validate
model = LinearRegression()

# 10-split cross-validation scored with negative mean squared error
scores = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')

# Negate the scores to get positive MSE values, then take the square root
rmse = np.sqrt(-scores)

# Display the per-split RMSE followed by the mean RMSE
print('Reg rmse:', rmse.round(2))
print('RMSE mean: %0.2f' % rmse.mean())

In [None]:
# XGBRegressor with an explicit squared-error objective
model = XGBRegressor(objective="reg:squarederror")

# 10-split cross-validation scored with negative mean squared error
scores = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')

# Negate the scores to get positive MSE values, then take the square root
rmse = np.sqrt(-scores)

# Display the per-split RMSE followed by the mean RMSE
print('Reg rmse:', rmse.round(2))
print('RMSE mean: %0.2f' % rmse.mean())

# Machine Learning - Classification

In [None]:
# Load the Census (adult) dataset from the UCI Machine Learning Repository.
# No `header` argument is passed here, so pandas applies its default header handling.
df_census = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data')

# Display the first 5 rows to inspect how the columns were named
df_census.head()

In [None]:
# Reload the Census dataset with header=None so that no line of the file is used as column names
df_census = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None)

# Display the first 5 rows
df_census.head()

In [None]:
# Assign names to the 15 df_census columns, in file order
df_census.columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

# Display the first 5 rows
df_census.head()

In [None]:
# Display the df_census summary produced by DataFrame.info()
df_census.info()

In [None]:
# Remove the 'education' column from df_census
df_census = df_census.drop(columns=['education'])

In [None]:
# Convert the non-numeric columns to indicator (dummy) columns with get_dummies
df_census = pd.get_dummies(df_census)

# Display the first 5 rows of the encoded frame
df_census.head()

In [None]:
# Remove the 'income_ <=50K' column (the leading space in the name is intentional)
df_census = df_census.drop(columns='income_ <=50K')

In [None]:
# Features X are every column except the last; target y is the last column
X, y = df_census.iloc[:, :-1], df_census.iloc[:, -1]

In [None]:
# Import Logistic Regression
from sklearn.linear_model import LogisticRegression

In [None]:
# Import cross_val_score
from sklearn.model_selection import cross_val_score

# Define cross_val function with classifier and num_splits as input
def cross_val(classifier, num_splits=10):
    """Cross-validate ``classifier`` and print the resulting scores.

    The notebook-level ``X`` and ``y`` are used as features and target. No
    ``scoring`` argument is passed to ``cross_val_score``, so the estimator's
    default scorer is used.

    Parameters
    ----------
    classifier
        Estimator instance handed to ``cross_val_score``.
    num_splits
        Value forwarded as ``cv`` (default 10).

    Returns
    -------
    None
        The per-split scores (rounded to 2 decimals) and their mean are
        printed, not returned.
    """
    # Run the cross-validation on the global X and y
    fold_scores = cross_val_score(classifier, X, y, cv=num_splits)

    # Report each split's score, then the mean over all splits
    print('Accuracy:', fold_scores.round(2))
    print('Accuracy mean: %0.2f' % fold_scores.mean())

In [None]:
# Score a LogisticRegression with default settings using the cross_val helper (10 splits by default)
cross_val(LogisticRegression())

In [None]:
# Import XGBoost Classifier
from xgboost import XGBClassifier

In [None]:
# Score an XGBClassifier built with n_estimators=5 using the cross_val helper (10 splits by default)
cross_val(XGBClassifier(n_estimators=5))