In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


In [2]:
cd "/content/drive/My Drive/ML"

/content/drive/My Drive/ML


In [0]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score

RANDOM_SEED = 8    # Set a random seed for reproducibility!
pd.set_option("display.max_columns", 100)

In [0]:
DATA_PATH = Path.cwd().parent / "data" / "final" / "public"

train_df = pd.read_csv(
    DATA_PATH / "/content/drive/My Drive/ML/train.csv", 
    index_col="tripid"
)
test_features_df = pd.read_csv(
    DATA_PATH / "/content/drive/My Drive/ML/test.csv", 
    index_col="tripid"
)

In [0]:
labels_df = train_df[['label']]
features_df = train_df.drop('label', axis=1)

In [0]:
labels_df=labels_df.replace("correct", 1)
labels_df=labels_df.replace("incorrect", 0)

In [0]:
numeric_cols = features_df.columns[features_df.dtypes != "object"].values

In [0]:
numeric_preprocessing_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('standard_scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers = [
        ("numeric", numeric_preprocessing_steps, numeric_cols)
    ],
    remainder = "drop"
)

In [0]:
estimator = RandomForestClassifier(n_estimators=10000)

In [0]:
full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", estimator),
])

In [0]:
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    labels_df,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=RANDOM_SEED
)

In [14]:
%%time

# Train model
full_pipeline.fit(X_train, y_train.values.ravel())

None   # don't print out the whole pipeline representation

CPU times: user 6min, sys: 505 ms, total: 6min
Wall time: 6min 1s


In [0]:
# Predict on evaluation set
preds = full_pipeline.predict(X_eval)

In [16]:
y_preds = pd.DataFrame(
    {
        "label": preds,
    },
    index = y_eval.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (5669, 1)


Unnamed: 0_level_0,label
tripid,Unnamed: 1_level_1
200328361,1
191127057,1
200136070,1
195614910,1
191488838,1


In [17]:
f1_score(y_eval, y_preds, average='macro')

0.7967226047045324

In [21]:
# retrain on full dataset
%%time 

full_pipeline.fit(features_df, labels_df.values.ravel())

None   # So we don't print out the whole pipeline representation

CPU times: user 9min 39s, sys: 427 ms, total: 9min 39s
Wall time: 9min 40s


In [0]:
preds1 = full_pipeline.predict(test_features_df)

In [0]:
submission_df = pd.read_csv(DATA_PATH / "/content/drive/My Drive/ML/sample_submission.csv", 
                            index_col="tripid")

In [0]:
# Make sure we have the rows in the same order
np.testing.assert_array_equal(test_features_df.index.values, 
                              submission_df.index.values)

In [0]:
# Save predictions to submission data frame
submission_df["prediction"] = preds1

In [0]:
submission_df.to_csv('my_submission.csv', index=True)