<a href="https://colab.research.google.com/github/Rathodkavya/CODSOFT-TASK-3/blob/main/AMEX_CHALLENGE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import gc


In [3]:
# Memory reduction helper
def reduce_mem_usage(df):
    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or str(col_type) == "category":
            df[col] = df[col].astype("category")

        elif str(col_type).startswith("int"):
            df[col] = pd.to_numeric(df[col], downcast='integer')

        elif str(col_type).startswith("float"):
            df[col] = pd.to_numeric(df[col], downcast='float')

    return df

# Load data and optimize memory
def load_parquet_optimized(path):
    df = pd.read_parquet(path)
    df = reduce_mem_usage(df)
    return df

# Load train data
train_df = load_parquet_optimized("/content/train_data.parquet")

In [4]:
# Drop columns with more than 40% missing
null_threshold = 0.4
null_ratios = train_df.isnull().mean()
high_null_cols = null_ratios[null_ratios > null_threshold].index.tolist()

train_df.drop(columns=high_null_cols, inplace=True)
print(f"Dropped columns: {high_null_cols}")

Dropped columns: ['f1', 'f2', 'f3', 'f4', 'f7', 'f11', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f29', 'f33', 'f34', 'f35', 'f36', 'f37', 'f40', 'f42', 'f43', 'f48', 'f57', 'f64', 'f66', 'f70', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f88', 'f92', 'f112', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f120', 'f121', 'f122', 'f135', 'f136', 'f154', 'f176', 'f187', 'f188', 'f189', 'f190', 'f191', 'f192', 'f193', 'f194', 'f195', 'f196', 'f197', 'f205', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f218', 'f220', 'f221', 'f360']


In [5]:
# Fill missing values efficiently
for col in train_df.columns:
    if train_df[col].isnull().any():
        if train_df[col].dtype.name == 'category':
            train_df[col] = train_df[col].cat.add_categories("MISSING").fillna("MISSING")
        elif train_df[col].dtype.kind in 'biufc':  # Numeric
            train_df[col].fillna(train_df[col].median(), inplace=True)
        else:  # Fallback for object
            train_df[col] = train_df[col].fillna("MISSING")


In [6]:
# Convert object columns with low unique ratio to category
for col in train_df.select_dtypes(include='object').columns:
    if train_df[col].nunique() / len(train_df) < 0.5:
        train_df[col] = train_df[col].astype('category')

# Reduce memory again after type conversions
train_df = reduce_mem_usage(train_df)
gc.collect()

50

In [7]:
print(f"Final shape: {train_df.shape}")
print(f"Total missing values: {train_df.isnull().sum().sum()}")
print(f"Estimated memory usage: {train_df.memory_usage().sum() / 1024**2:.2f} MB")

Final shape: (770164, 297)
Total missing values: 0
Estimated memory usage: 456.79 MB


In [8]:
print(train_df.columns.tolist())

['id1', 'id2', 'id3', 'id4', 'id5', 'y', 'f5', 'f6', 'f8', 'f9', 'f10', 'f12', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f30', 'f31', 'f32', 'f38', 'f39', 'f41', 'f44', 'f45', 'f46', 'f47', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f65', 'f67', 'f68', 'f69', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f85', 'f86', 'f87', 'f89', 'f90', 'f91', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f113', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f134', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f144', 'f145', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f177', 'f178', 'f179', 'f180', 

In [9]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# ✅ Sample 20% for fast processing
sampled_df = train_df.sample(frac=0.2, random_state=42)

# ✅ Separate target and features
y = sampled_df['y']
X = sampled_df.drop(columns=['y'])

# ✅ Split for quick validation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# ✅ Train a lightweight LightGBM model
model = LGBMClassifier(
    n_estimators=50,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

# ✅ Get feature importances
importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)

# ✅ Show top 20 important features
print("🔝 Top 20 Important Features:")
print(importances.head(20).to_string(index=False))

[LightGBM] [Info] Number of positive: 5879, number of negative: 117347
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.924457 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393565
[LightGBM] [Info] Number of data points in the train set: 123226, number of used features: 258
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047709 -> initscore=-2.993749
[LightGBM] [Info] Start training from score -2.993749
🔝 Top 20 Important Features:
Feature  Importance
   f350          48
   f354          46
    f52          36
    f53          27
    id5          26
    id3          21
   f285          19
   f144          19
   f125          18
    f50          18
    f98          17
    f23          17
   f131          16
   f365          16
    f96          15
    f95          15
   f364          15
   f363          12
    f99          12
   f366          11


In [11]:
model = LGBMClassifier(
    n_estimators=100,
    max_depth=-1,  # Let it grow freely
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)


In [16]:
print(train_df.columns)


Index(['id1', 'id2', 'id3', 'id4', 'id5', 'y', 'f5', 'f6', 'f8', 'f9',
       ...
       'f356', 'f357', 'f358', 'f359', 'f361', 'f362', 'f363', 'f364', 'f365',
       'f366'],
      dtype='object', length=297)


In [19]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Step 1: Sample 20% of the data for quick importance check
sampled_df = train_df.sample(frac=0.2, random_state=42)

# Step 2: Split features and target
X = sampled_df.drop(columns=['y'])
y = sampled_df['y']

# Step 3: Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 4: Train a light model
model = LGBMClassifier(
    n_estimators=50,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

# Step 5: Extract feature importances
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model.feature_importances_
}).sort_values(by='Importance', ascending=False)


[LightGBM] [Info] Number of positive: 5879, number of negative: 117347
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.329396 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 393565
[LightGBM] [Info] Number of data points in the train set: 123226, number of used features: 258
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.047709 -> initscore=-2.993749
[LightGBM] [Info] Start training from score -2.993749


In [20]:
top_features = importance_df['Feature'].head(50).tolist()


In [21]:
model_df = train_df[top_features + ['y']].copy()


In [22]:
model_df

Unnamed: 0,f350,f354,f52,f53,id5,id3,f285,f144,f125,f50,...,f358,f342,f132,f140,f227,f275,f315,f22,f77,y
0,80458,Phase_1,MISSING,MISSING,2023-11-02,189706075,1.0,0.0,1.0,MISSING,...,-9999.0,1.2282112563054637,0.0005313496280552,0.0909090909090909,0.0,0.0,MISSING,2.0,0.2156862745098039,0
1,85874,MISSING,MISSING,MISSING,2023-11-01,89227,0.0,0.0,1.0,MISSING,...,MISSING,1.112347267604396,0.0005313496280552,0.0909090909090909,0.0,0.0,MISSING,1.0,0.2156862745098039,0
2,1855,MISSING,MISSING,MISSING,2023-11-01,35046,1.0,0.0,1.0,MISSING,...,MISSING,1.1796587263277778,0.0006553079947575,0.1428571428571428,0.0,0.0,MISSING,1.0,0.0909090909090909,0
3,80458,Phase_1,MISSING,MISSING,2023-11-02,6275451,1.0,0.0,1.0,MISSING,...,-9999.0,1.1988587335904886,0.0005313496280552,0.0909090909090909,0.0,0.0,MISSING,2.0,0.2156862745098039,0
4,80458,Phase_1,MISSING,MISSING,2023-11-02,78053,1.0,0.0,1.0,MISSING,...,-9999.0,1.2988381024516549,0.0005313496280552,0.0909090909090909,0.0,0.0,MISSING,2.0,0.2156862745098039,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770159,29659,Rest,N,MISSING,2023-11-02,87731,1.0,0.0,2.0,N,...,0.0404038812929242,1.0665911311444454,0.0048076923076923,0.0,0.0,0.0,225.0,1.0,0.2699619771863117,0
770160,29659,Rest,N,MISSING,2023-11-02,505604,0.0,0.0,2.0,N,...,0.0506497784165768,1.0531005836245653,0.0048076923076923,0.0,0.0,0.0,199.0,1.0,0.2699619771863117,0
770161,29659,Rest,N,MISSING,2023-11-02,25212,0.0,0.0,2.0,N,...,0.0498707994939233,MISSING,0.0048076923076923,0.0,0.0,0.0,211.0,1.0,0.2699619771863117,0
770162,22103,MISSING,MISSING,MISSING,2023-11-02,95157,0.0,MISSING,0.0,MISSING,...,MISSING,1.125242322387316,0.0,MISSING,0.0,1.0,MISSING,MISSING,0.0555555555555555,0


In [23]:
X = model_df.drop(columns=['y'])
y = model_df['y']


In [24]:
from lightgbm import LGBMClassifier

final_model = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

final_model.fit(X, y)


[LightGBM] [Info] Number of positive: 37051, number of negative: 733113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.155603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 111041
[LightGBM] [Info] Number of data points in the train set: 770164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.048108 -> initscore=-2.985005
[LightGBM] [Info] Start training from score -2.985005


In [25]:
train_df['click_prob'] = final_model.predict_proba(X)[:, 1]


In [29]:
# Rank top 7 offers per customer by predicted probability
top_offers = (
    train_df[['id1', 'id2', 'click_prob']]
    .sort_values(['id1', 'click_prob'], ascending=[True, False])
    .groupby('id1')
    .head(7)
)

# Convert to final submission format
submission = top_offers.groupby('id1')['id2'].apply(list).reset_index()
submission.columns = ['id1', 'recommended_offers']

# Optional: format as space-separated string
submission['recommended_offers'] = submission['recommended_offers'].apply(lambda x: ' '.join(map(str, x)))

# Save to CSV
submission.to_csv("r2_submission_file_1_<OPTIMISERS>.csv", index=False)

  .groupby('id1')
  submission = top_offers.groupby('id1')['id2'].apply(list).reset_index()


In [27]:
top_offers = (
    train_df[['id1', 'id2', 'click_prob']]
    .sort_values(['id1', 'click_prob'], ascending=[True, False])
    .groupby('id1', observed=True)
    .head(7)
)

submission = top_offers.groupby('id1', observed=True)['id2'].apply(list).reset_index()
