In [5]:
# installing 1.0.3 because this version of pandas supports write to s3
!pip install pandas==1.0.3

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
def select_data(data_frame):
    '''Keep only the raw GCR record columns that the pipeline works with.

    Returns the columns line_id, carrier, plan_name, total_kb and upgrade,
    in that order, for every row of ``data_frame``.
    '''
    wanted_columns = ['line_id', 'carrier', 'plan_name', 'total_kb', 'upgrade']
    return data_frame.loc[:, wanted_columns]

def get_usage_summary(df):
    '''Total the data usage of every line_id.

    Returns a frame with one row per line_id and two columns:
    ``line_id`` and ``sum_total_kb`` (the sum of ``total_kb`` for that line).
    '''
    usage = df.loc[:, ['line_id', 'total_kb']]
    totals_per_line = usage.groupby('line_id')['total_kb'].sum()
    # Turn the grouped Series back into a flat two-column frame.
    return totals_per_line.reset_index().rename(columns={'total_kb': 'sum_total_kb'})

def preprocess_customer_info(df):
    '''Build one-hot encoded carrier / plan_name features per line_id.

    Takes the distinct (line_id, carrier, plan_name) rows of ``df``, fills
    nulls in carrier and plan_name with that column's most frequent value,
    then one-hot encodes both columns (dropping the first level of each).

    Returns a frame with ``line_id`` plus the dummy columns. ``df`` itself is
    not modified.
    '''
    categorical_cols = ['carrier', 'plan_name']
    customer_info = df.loc[:, ['line_id'] + categorical_cols].drop_duplicates()

    # Collect the most frequent value per column. A column that is entirely
    # null has no mode, so it is left un-imputed instead of raising.
    fill_values = {}
    for col in categorical_cols:
        modes = customer_info[col].mode()
        if not modes.empty:
            fill_values[col] = modes.iloc[0]

    # Assign the result of fillna rather than calling it with inplace=True on
    # a column selection: the chained in-place form is not guaranteed to
    # write back into the frame.
    if fill_values:
        customer_info = customer_info.fillna(value=fill_values)

    return pd.get_dummies(customer_info, columns=categorical_cols, drop_first=True)

def train_model(train_X,train_Y):
    '''Fit a logistic regression on an already preprocessed training set.

    The estimator is created with an intercept and ``class_weight='balanced'``.
    Returns the fitted model object.
    '''
    classifier = LogisticRegression(class_weight='balanced', fit_intercept=True)
    # fit() returns the estimator itself, so the fitted object is handed back.
    return classifier.fit(train_X, train_Y)

def evaluate_model(model, test_X,test_Y):
    '''Score ``model`` on a preprocessed test set.

    Predicts labels for ``test_X``, compares them with ``test_Y`` and returns
    the F1 score rounded to three decimal places.
    '''
    predicted_labels = model.predict(test_X)
    return round(f1_score(test_Y, predicted_labels), 3)

def main(data_path, random_seed, test_ratio=0.2):
    '''Run the whole training pipeline: load, preprocess, split, fit, score.

    data_path   -- location of the CSV file with the raw records.
    random_seed -- seed passed to the train/test split.
    test_ratio  -- fraction of rows held out for testing (default 0.2).

    Prints the train and test set sizes and returns ``(model, f1)``.
    '''
    raw_records = pd.read_csv(data_path, low_memory=True)
    selected = select_data(raw_records)
    usage_per_line = get_usage_summary(selected)
    customer_features = preprocess_customer_info(selected)

    # Distinct (line_id, upgrade) pairs are the rows the model is trained on.
    labelled_lines = (
        selected.loc[:, ['line_id', 'upgrade']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    model_frame = pd.merge(labelled_lines, usage_per_line, on='line_id', how='inner')
    model_frame = pd.merge(model_frame, customer_features, on='line_id')

    # Encode the yes/no label as 1/0 and drop the non-feature columns.
    targets = model_frame['upgrade'].replace({'yes': 1, 'no': 0})
    predictors = model_frame.drop(columns=['line_id', 'upgrade'])

    X_train, X_test, Y_train, Y_test = train_test_split(
        predictors, targets, test_size=test_ratio, random_state=random_seed
    )
    print(f"Train has {len(X_train)}")
    print(f"Test has {len(X_test)}")

    model = train_model(X_train, Y_train)
    f1 = evaluate_model(model, X_test, Y_test)
    return model, f1

def make_predictions(model,eval_data_path,submission_path):
    '''Predict on the evaluation data and save the submission file.

    model           -- fitted model exposing ``predict``.
    eval_data_path  -- location of the CSV file with the evaluation records.
    submission_path -- destination of the CSV with ``line_id`` and
                       ``prediction`` columns (written with a header and
                       without the index).

    Prints a confirmation message once the file has been written.
    '''
    # Read the file named by the eval_data_path argument, not the notebook's
    # global data_path (which points at the training sample).
    eval_data = pd.read_csv(eval_data_path, low_memory=True)

    # Only the feature columns are needed for prediction. The `upgrade` label
    # is not used here, so it is not required to be present in the eval file.
    df_selected = eval_data.loc[:, ['line_id', 'carrier', 'plan_name', 'total_kb']]
    usage_summary = get_usage_summary(df_selected)
    customer_info = preprocess_customer_info(df_selected)

    line_ids = df_selected.loc[:, ['line_id']].drop_duplicates().reset_index(drop=True)
    df_preprocessed = pd.merge(line_ids, usage_summary, on='line_id', how='inner')
    df_preprocessed = pd.merge(df_preprocessed, customer_info, on='line_id')

    # NOTE(review): the one-hot columns depend on the categories present in
    # this file; presumably they must match the columns seen in training -- confirm.
    eval_X = df_preprocessed.drop(columns=['line_id'])

    # Take the ids from the frame that is actually scored so each prediction
    # stays on the same row as its line_id even if the merges changed the
    # number of rows.
    predictions = df_preprocessed.loc[:, ['line_id']].reset_index(drop=True)
    predictions['prediction'] = model.predict(eval_X)
    predictions.to_csv(submission_path, header=True, index=False)
    print(f"submission saved to {submission_path}")

In [3]:
# This path will be active after the launch of the hackathon.
teamname = 'trachack-a-groups-admin-py-tracfone'
data_folder = 's3://tf-trachack-data/212/'
# Point root_folder at your own team's root folder, i.e.
# s3://tf-trachack-notebooks/<team name as provided in EMAIL>/jupyter/jovyan/
root_folder = f's3://tf-trachack-notebooks/{teamname}/jupyter/jovyan/'
data_path = f'{root_folder}sample-notebook/dev-sample.csv'
seed = 123
# Train on the development sample and report the F1 score of the test split.
model, f1 = main(data_path, seed)
print(f"f1-score: {f1}")

Train has 15256
Test has 3815
f1-score: 0.055


In [4]:
# Score the evaluation sample with the trained model and write the submission file.
eval_data_path = f'{root_folder}sample-notebook/eval-sample.csv'
submission_path = f'{root_folder}submission/2021-04-05.csv'
make_predictions(model, eval_data_path, submission_path)

submission saved to s3://tf-trachack-notebooks/trachack-a-groups-admin-py-tracfone/jupyter/jovyan/submission/test-submission.csv
