In [3]:
import pandas as pd
import numpy as np

In [4]:
result_df = pd.read_csv('round1.csv')

In [5]:
# Assuming your original DataFrame is named 'original_df'
# Select all rows with activation = 1
activation_1_df = result_df[result_df['ind_recommended'] == 1]

# Randomly select 200,000 rows with activation = 0
activation_0_df = result_df[result_df['ind_recommended'] == 0].sample(n=3000000, random_state=42)

# Concatenate the two DataFrames to create the final DataFrame
df = pd.concat([activation_1_df, activation_0_df])

# Reset the index of the new DataFrame
df = df.reset_index(drop=True)

In [6]:
# Store the 'activation' column separately
activation_column = df['activation']

# Remove the 'activation' column from the DataFrame
df = df.drop(columns=['activation','ind_recommended'])

In [7]:
import random

# Replace missing values with random sampling from observed categories
observed_categories = df['merchant_profile_01'].dropna().unique()
df['merchant_profile_01'].fillna(random.choice(observed_categories), inplace=True)

In [6]:
# encoded_df = pd.get_dummies(df, columns=['merchant_profile_01'], prefix='merchant')

In [8]:
columns_to_drop = ['customer_merchant_01', 'customer_digital_activity_07', 'customer_digital_activity_08', 'customer_digital_activity_09', 'customer_digital_activity_18', 'merchant_spend_11']

In [9]:
df.drop(columns_to_drop,axis=1,inplace=True)

In [None]:
# pip install xgboost

In [10]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [11]:
X = df  # Replace 'target_column' with the name of your target column
y = activation_column  # Replace 'target_column' with the name of your target column

# X is your feature matrix, and y is your target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, stratify=y, random_state=42)

In [12]:
model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)  # For binary classification
# If you have more than two classes, set objective='multi:softprob' and adjust num_class accordingly

# Train the model on the training data
model.fit(X_train, y_train)

In [53]:
# Make predictions on the test set
y_pred = model.predict_proba(X)

# Evaluate the model (use appropriate metrics for your recommendation problem)
# accuracy = precision_score(y_test, y_pred)
# recall = recall_score(y_test, y_pred)
# print(f"Accuracy: {accuracy}")
# print(recall)

In [13]:
eval_df = pd.read_csv('Amex_Campus_Challenge_Eval_Round1.csv')

In [14]:
observed_categories = eval_df['merchant_profile_01'].dropna().unique()
eval_df['merchant_profile_01'].fillna(random.choice(observed_categories), inplace=True)

In [16]:
column_list = X_train.columns.tolist()

In [15]:
eval_df.drop(columns_to_drop,axis=1,inplace=True)

In [17]:
eval_df = eval_df[column_list]
eval_df.head()

Unnamed: 0,customer_digital_activity_04,customer_spend_01,customer_industry_spend_01,customer_industry_spend_02,customer_industry_spend_03,customer_industry_spend_04,customer_industry_spend_05,customer_spend_02,customer_spend_03,customer_merchant_02,...,merchant_spend_09,merchant_profile_03,customer_digital_activity_01,merchant_spend_10,customer_profile_03,customer_digital_activity_02,customer_profile_04,distance_05,customer,merchant
0,,112.334,80.5525,9.0,966.63,12.0,10.0,4.0,41.0,,...,26299.0,4777.0,0.8,33.3,72.268283,7.0,423.0,1.621171,467915,599167
1,,112.334,,,,,,4.0,41.0,,...,7122.0,4803.0,0.8,793.29,72.268283,7.0,423.0,2.441944,467915,686617
2,,112.334,71.1925,3.0,284.77,4.0,4.0,4.0,41.0,,...,7222.0,14860.0,0.8,100.0,72.268283,7.0,423.0,2.438082,467915,829193
3,,112.334,,,,,,4.0,41.0,,...,11410.0,11968.0,0.8,252.38,72.268283,7.0,423.0,2.072182,467915,1077034
4,,302.7925,,,,,,3.0,37.0,,...,1847.0,5842.0,0.8,87.5,72.268283,7.0,423.0,2.380853,467915,876647


In [18]:
# Set the batch size
batch_size = 500000

# Initialize an empty array to store predictions
all_predictions = []
concatenated_list = []
# Batch processing loop
for i in range(0, eval_df.shape[0], batch_size):
    batch_start = i
    batch_end = min(i + batch_size, eval_df.shape[0])
    batch_data = eval_df.iloc[batch_start:batch_end]

    # Make predictions on the current batch
    batch_predictions = model.predict_proba(batch_data)

    # Append batch predictions to the overall predictions
    all_predictions.append(batch_predictions)
    print(i)

# Concatenate all batch predictions into a single DataFrame
for sublist in all_predictions:
    concatenated_list.extend(sublist)

# Now, 'final_predictions' contains the predictions for the entire feature DataFrame


0
500000
1000000
1500000
2000000
2500000
3000000
3500000
4000000
4500000
5000000
5500000
6000000
6500000
7000000
7500000
8000000


In [20]:
final_df = eval_df[['customer', 'merchant']]

In [21]:
new_list = [arr[1] for arr in concatenated_list]


In [22]:
final_df['predicted_score'] = new_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['predicted_score'] = new_list


In [23]:
final_df.head()

Unnamed: 0,customer,merchant,predicted_score
0,467915,599167,0.000139
1,467915,686617,0.000133
2,467915,829193,0.000157
3,467915,1077034,0.000713
4,467915,876647,0.000495


In [24]:
final_df.to_csv('final_submission.csv', index=False)

In [50]:
fdf = pd.read_csv('submission.csv')

In [51]:
fdf.head()

Unnamed: 0,customer,merchant,predicted_score
0,158549,506141,0.006768
1,332289,104507,0.002835
2,382952,540094,0.000495
3,82857,62454,8.5e-05
4,434359,308336,0.00104


In [53]:
dep_df = f_encoded_df[['customer', 'merchant', 'ind_recommended']]

In [55]:
dep_df['activation'] = f_activation_column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dep_df['activation'] = f_activation_column


In [59]:
dep_df.to_csv('dep_var.csv', index = False)

In [44]:
# unique_values, counts = np.unique(y_pred, return_counts=True)

# # Print unique values and their counts
# for value, count in zip(unique_values, counts):
#     print(f"{value}: {count}")

0: 3265470
1: 845


In [45]:
# y_test.value_counts()

activation
0    3247224
1      19091
Name: count, dtype: int64

In [38]:
# y_actual = model.predict(X)

In [39]:
# unique_values, counts = np.unique(y_actual, return_counts=True)

# # Print unique values and their counts
# for value, count in zip(unique_values, counts):
#     print(f"{value}: {count}")

0: 3713848
1: 369045


In [40]:
# y.value_counts()

ind_recommended
0    3000000
1    1082893
Name: count, dtype: int64