In [1]:
import pandas as pd
import datetime
import polars as pl

In [2]:
def read_data(path, sep=None):
    if sep is not None:
        df = pd.read_csv(path, sep=sep)
    else:
        df = pd.read_csv(path)
    return df

In [3]:
train_df = read_data('/home/jupyter/datasphere/project/train/train.csv')
test_df_source = read_data('/home/jupyter/datasphere/project/test/test.csv')

In [4]:
test_df = test_df_source.copy()

In [5]:
train_df['quarter'] = train_df['quarter'].apply(lambda x: int(x[-1]))
test_df['quarter'] = test_df['quarter'].apply(lambda x: int(x[-1]))

In [15]:
from tqdm import tqdm

def get_previous_year_q(year, quarter):
    previous_year = year
    previous_quarter = quarter - 1
    if previous_quarter == 0:
        previous_year -= 1
        previous_quarter = 4
        
    return previous_year, previous_quarter

def get_dataset(df, start_year=1990, end_year=2023, test=False):
    df = pl.from_pandas(df)
    result = []
    num_features = ['balance', 
                    'oprtn_sum_per_qrtr',
                    'oprtn_sum_per_year',
                    'frst_pmnt',
                    'pmnts_sum',
                    'pmnts_nmbr_per_qrtr',
                    'incm_sum',
                    'incm_per_qrtr']
    
    for target_year in tqdm(range(start_year, end_year)):
        for target_quarter in range(1, 5):
            # if test and target_quarter > 1:
            #     continue
            target_df = df.filter((pl.col("year") == target_year) & (pl.col("quarter") == target_quarter))
            if target_df.shape[0] == 0:
                continue
            previous_year, previous_quarter = get_previous_year_q(target_year, target_quarter)
            
            previous_df = df.filter((pl.col("year") == previous_year) & (pl.col("quarter") == previous_quarter))

            target = target_df.select(["npo_account_id", "year", "quarter", "churn"])
            
            previous_df = previous_df.drop(['churn', 'year', 'quarter'])
            joined_df = target.join(previous_df, on="npo_account_id", how="left")
            
            result.append(joined_df)
            
    return result

def process_data(df):
    to_drop = ['region', 'client_id', 'npo_account_id', 'slctn_nmbr']
    drop_on_date = ['lst_pmnt_date_per_qrtr', 'frst_pmnt_date', 'oprtn_sum_per_year']

    df.drop(to_drop, axis=1, inplace=True)

    for column in drop_on_date:
        df[column + '_year'] = pd.to_datetime(df[column]).dt.year
        df[column + '_month'] = pd.to_datetime(df[column]).dt.month
        df[column + '_day'] = pd.to_datetime(df[column]).dt.day

    df.drop(drop_on_date, axis=1, inplace=True)
    
    return df

In [None]:
train_df = pl.concat(get_dataset(train_df, 2004, 2023)).to_pandas()
train_df = process_data(train_df)
test_df = process_data(test_df)

In [18]:
from catboost import CatBoostClassifier

X_train, y_train = train_df.drop('churn', axis=1), train_df['churn']

model = CatBoostClassifier(iterations=500,
                            learning_rate=0.01,
                            depth=4,
                            verbose=300,
                            random_seed=42,
                            )

In [19]:
model.fit(X_train, y_train)

0:	learn: 0.3551523	total: 306ms	remaining: 3m 57s
100:	learn: 0.3733589	total: 28.8s	remaining: 3m 13s
200:	learn: 0.3780617	total: 57.8s	remaining: 2m 45s
300:	learn: 0.3877631	total: 1m 24s	remaining: 2m 14s
400:	learn: 0.3928413	total: 1m 51s	remaining: 1m 44s
500:	learn: 0.3970476	total: 2m 18s	remaining: 1m 16s
600:	learn: 0.4028531	total: 2m 45s	remaining: 48.5s
700:	learn: 0.4074003	total: 3m 12s	remaining: 20.9s
776:	learn: 0.4114292	total: 3m 32s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f5ad86d1180>

In [60]:
from tqdm.auto import tqdm

def get_previous_year_q(year, quarter):
    previous_year = year
    previous_quarter = quarter - 1
    if previous_quarter == 0:
        previous_year -= 1
        previous_quarter = 4
        
    return previous_year, previous_quarter

def get_dataset(df, start_year, end_year, test=False):
    result = []
    num_features = ['balance', 
                    'oprtn_sum_per_qrtr',
                    'oprtn_sum_per_year',
                    'frst_pmnt',
                    'pmnts_sum',
                    'pmnts_nmbr_per_qrtr',
                    'incm_sum',
                    'incm_per_qrtr']
    
    for target_year in tqdm(range(start_year, end_year)):
        for target_quarter in range(1, 5):
            # if test and target_quarter > 1:
            #     continue
            target_df = df.filter((pl.col("year") == target_year) & (pl.col("quarter") == target_quarter))
            if target_df.shape[0] == 0:
                continue
            previous_year, previous_quarter = get_previous_year_q(target_year, target_quarter)
            
            previous_df = df.filter((pl.col("year") == previous_year) & (pl.col("quarter") == previous_quarter))

            target = target_df.select(["npo_account_id", "year", "quarter", "churn"])
            
            previous_df = previous_df.drop(['churn', 'year', 'quarter'])
            joined_df = target.join(previous_df, on="npo_account_id", how="left")
            
            result.append(joined_df)
            
    return result

In [61]:
train_dataset = get_dataset(df, start_year=1990, end_year=2023)

100%|██████████| 33/33 [00:04<00:00,  7.41it/s]


In [62]:
test_dataset = df_test.to_pandas() #get_dataset(df, start_year=1990, end_year=2023, test=True)

In [63]:
train_dataset = pl.concat(train_dataset)
#test_dataset = df_test

In [64]:
train_dataset = df.to_pandas()
#test_dataset = df_test.to_pandas()

In [65]:
from catboost import CatBoostClassifier

In [66]:
def process_data(df):
    to_drop = ['region', 'client_id', 'npo_account_id', 'slctn_nmbr']
    drop_on_date = ['lst_pmnt_date_per_qrtr', 'frst_pmnt_date', 'oprtn_sum_per_year']

    df.drop(to_drop, axis=1, inplace=True)

    for column in drop_on_date:
        df[column + '_year'] = pd.to_datetime(df[column]).dt.year
        df[column + '_month'] = pd.to_datetime(df[column]).dt.month
        df[column + '_day'] = pd.to_datetime(df[column]).dt.day

    df.drop(drop_on_date, axis=1, inplace=True)
    
    return df

In [67]:
train_dataset = process_data(train_dataset)
test_dataset = process_data(test_dataset)

In [68]:
test_dataset

Unnamed: 0,npo_accnts_nmbr,pmnts_type,year,quarter,gender,age,clnt_cprtn_time_d,actv_prd_d,lst_pmnt_rcnc_d,balance,oprtn_sum_per_qrtr,frst_pmnt,lst_pmnt,pmnts_sum,pmnts_nmbr,pmnts_sum_per_qrtr,pmnts_sum_per_year,pmnts_nmbr_per_qrtr,pmnts_nmbr_per_year,incm_sum,incm_per_qrtr,incm_per_year,mgd_accum_period,mgd_payment_period,phone_number,email,lk,assignee_npo,assignee_ops,postal_code,citizen,fact_addrss,appl_mrkr,evry_qrtr_pmnt,lst_pmnt_date_per_qrtr_year,lst_pmnt_date_per_qrtr_month,lst_pmnt_date_per_qrtr_day,frst_pmnt_date_year,frst_pmnt_date_month,frst_pmnt_date_day,oprtn_sum_per_year_year,oprtn_sum_per_year_month,oprtn_sum_per_year_day
0,2,1,2017,2,1,54,6595,4543,31,464770.58,10880.44,19164.90,4168.46,342134.88,151,10880.43,39763.52,3,11,122635.69,-0.00,-0.00,0.0,0.0,-1,0,1,-1,-1,398046.0,1,1,0,1,2017.0,6.0,9.0,2004.0,12.0,31.0,1970,1,1
1,1,3,2021,4,1,75,7078,0,2209,221.24,10.54,82.48,82.48,82.48,1,-0.00,-0.00,0,0,138.75,10.53,10.53,5.0,5.0,-1,-1,-1,-1,-1,0.0,1,1,0,0,,,,2015.0,12.0,14.0,1970,1,1
2,1,2,2015,3,1,35,1675,1615,29,27071.14,1850.63,301.96,948.37,24972.87,54,1850.63,6401.21,3,10,2098.26,-0.00,2138.83,0.0,0.0,0,0,-1,-1,-1,666211.0,-1,1,0,0,2015.0,9.0,14.0,2011.0,4.0,13.0,1970,1,1
3,1,1,2015,3,1,50,5180,3538,30,50580.42,1219.51,8455.28,406.50,31951.22,118,1219.51,4878.04,3,12,13909.69,-0.00,1982.57,4.0,4.0,-1,-1,-1,-1,-1,162610.0,1,1,0,1,2015.0,9.0,8.0,2005.0,12.0,31.0,1970,1,1
4,2,1,2011,3,-1,46,296,276,30,7300.73,1747.83,301.83,1353.97,7299.96,10,1747.83,9220.26,3,12,0.76,-0.00,248.13,0.0,0.0,-1,0,-1,-1,-1,427111.0,1,1,0,1,2011.0,9.0,11.0,2010.0,12.0,9.0,1970,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633429,1,3,2022,3,-1,55,2510,2430,28,301321.98,3298.51,2659.61,3298.51,254564.78,81,3298.51,20151.59,1,7,46757.19,-0.00,-0.00,0.0,0.0,-1,0,1,-1,-1,352800.0,1,1,0,1,2022.0,7.0,12.0,2015.0,11.0,16.0,1970,1,1
633430,1,3,2020,2,1,52,1580,1491,29,383896.62,23074.76,8893.30,250.66,343584.73,52,23074.75,82051.25,3,12,40311.87,-0.00,15456.55,0.0,0.0,-1,0,-1,-1,-1,614016.0,1,1,0,1,2020.0,6.0,10.0,2016.0,5.0,11.0,1970,1,1
633431,1,3,2022,4,1,49,382,365,29,38564.94,8398.63,2527.57,2676.06,38558.49,14,8398.62,36030.92,3,13,6.44,-0.00,-0.00,0.0,0.0,-1,0,-1,-1,-1,628331.0,-1,-1,0,1,2022.0,12.0,14.0,2021.0,12.0,14.0,1970,1,1
633432,1,3,2021,2,1,33,890,825,21,19998.37,1764.69,402.99,725.38,19403.32,28,1764.69,9493.61,3,13,595.04,-0.00,1141.73,0.0,0.0,-1,0,-1,-1,-1,665824.0,1,1,0,1,2021.0,6.0,15.0,2019.0,3.0,13.0,1970,1,1


In [73]:
from sklearn.utils.class_weight import compute_class_weight

X_train, y_train = train_dataset.drop('churn', axis=1), train_dataset['churn']

model = CatBoostClassifier(iterations=777,
                            learning_rate=0.01,
                            depth=2,
                            verbose=100,
                            random_seed=42,
                            eval_metric='F1',
                           
                            )

SyntaxError: keyword argument repeated: verbose (<ipython-input-73-37f2f0e47a75>, line 11)

In [74]:
model.fit(X_train, y_train)

0:	learn: 0.0000000	total: 166ms	remaining: 16.4s
1:	learn: 0.2293447	total: 321ms	remaining: 15.7s
2:	learn: 0.2293447	total: 476ms	remaining: 15.4s
3:	learn: 0.2293447	total: 632ms	remaining: 15.2s
4:	learn: 0.2293447	total: 791ms	remaining: 15s
5:	learn: 0.2972851	total: 947ms	remaining: 14.8s
6:	learn: 0.2807354	total: 1.1s	remaining: 14.6s
7:	learn: 0.2807354	total: 1.27s	remaining: 14.6s
8:	learn: 0.1248740	total: 1.42s	remaining: 14.4s
9:	learn: 0.2747298	total: 1.58s	remaining: 14.2s
10:	learn: 0.2256421	total: 1.73s	remaining: 14s
11:	learn: 0.2293447	total: 1.89s	remaining: 13.8s
12:	learn: 0.2293447	total: 2.04s	remaining: 13.6s
13:	learn: 0.2293447	total: 2.19s	remaining: 13.5s
14:	learn: 0.2293447	total: 2.34s	remaining: 13.3s
15:	learn: 0.2293447	total: 2.49s	remaining: 13.1s
16:	learn: 0.2293447	total: 2.65s	remaining: 12.9s
17:	learn: 0.2294499	total: 2.79s	remaining: 12.7s
18:	learn: 0.2295047	total: 2.95s	remaining: 12.6s
19:	learn: 0.2294499	total: 3.1s	remaining: 12

<catboost.core.CatBoostClassifier at 0x7f23afef4310>

In [75]:
X_test = test_dataset
y_pred = model.predict(X_test)

In [76]:
y_pred.shape

(633434,)

In [77]:
ans_final = test_df[['npo_account_id', 'quarter']]
ans_final['churn'] = y_pred
ans_final.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ans_final['churn'] = y_pred


Unnamed: 0,npo_account_id,quarter,churn
0,0xAA9E8DDF1310724995598EA2B42D7D87,2017Q2,0
1,0x3B9B09857D152F468A42C5DEE6D723F7,2021Q4,0
2,0xA7F12C768A4FB38311E835E4A7632E00,2015Q3,0
3,0xBFE4BDFBE0E037478C29BA4F38121B15,2015Q3,0
4,0x4B5C7804E403C842B4ECBF69BC3EDD7C,2011Q3,0


In [None]:
ans_final.to_csv('1ans.csv')