Welcome to the CCAC Green Light District Notebook submission. Point to note: Please convert data into format provided in the train data. For details on how we created the features you can email: sahoo14@purdue.edu or vponduri@purdue.edu

# DATA PREPROCESSING

## Import the data

In [10]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Attempt to read the CSV files with a specified encoding if UTF-8 fails
try:
    train_df = pd.read_csv('DIWBB_Training.csv', encoding='utf-8')
except UnicodeDecodeError:
    train_df = pd.read_csv('DIWBB_Training.csv', encoding='windows-1252')  # or 'iso-8859-1' as an alternative

try:
    test_df = pd.read_csv('DIWBB_Test.csv', encoding='utf-8')
except UnicodeDecodeError:
    test_df = pd.read_csv('DIWBB_Test.csv', encoding='windows-1252')  # or 'iso-8859-1' as an alternative

# Proceed with your data processing here


In [11]:
# Removing some features that did not help us in the final code

# Drop columns from train_df
train_df = train_df.drop(['state1', 'state2', 'state3', 'state4'], axis=1)

# Drop columns from test_df
test_df = test_df.drop(['state1', 'state2', 'state3', 'state4'], axis=1)


## Converting the teams to one hot encoding.

In [12]:
import pandas as pd

# Assuming df_train and df_test are your DataFrames
# You would replace the sample data with your actual DataFrames

# Function to one-hot encode the teams across specified columns in a DataFrame
def efficient_one_hot(df):
    team_columns = ['team1', 'team2', 'team3', 'team4']
    # Dynamically identify all unique team names from the team columns
    unique_teams = pd.unique(df[team_columns].values.ravel('K'))
    
    # Create a new DataFrame to hold the one-hot encoded values
    encoded_df = pd.DataFrame(index=df.index)
    
    # One-hot encode each unique team
    for team in unique_teams:
        # Check for the presence of the team in any of the specified columns and encode accordingly
        encoded_df[team] = df[team_columns].isin([team]).any(axis=1).astype(int)
    
    return encoded_df

# Example usage with a DataFrame (replace 'df_train' with your actual DataFrame)
df_train_encoded = efficient_one_hot(train_df)
df_test_encoded = efficient_one_hot(test_df)

# Note: Ensure df_train and df_test are loaded or defined before applying this function.

# This code handles the entire dataset, ensuring all unique teams found in 'team1', 'team2', 'team3', and 'team4' 
# across the DataFrame are included in the one-hot encoding. The result is a DataFrame where each team has its own column,
# indicating presence (1) or absence (0) in any of the team columns for each row.


## Converting 'Unknown' and 'unknown' into nulls to impute later

In [13]:
# Assuming df_train_encoded and df_train are already defined

# Step 1: Drop the "unknown" column from df_train_encoded if it exists
if 'unknown' in df_train_encoded.columns:
    df_train_encoded.drop('unknown', axis=1, inplace=True)
if 'Unknown' in df_train_encoded.columns:
    df_train_encoded.drop('Unknown', axis=1, inplace=True)

    
if 'unknown' in df_test_encoded.columns:
    df_test_encoded.drop('unknown', axis=1, inplace=True)
    
if 'Unknown' in df_test_encoded.columns:
    df_test_encoded.drop('Unknown', axis=1, inplace=True)
# Step 2: Append (or concatenate) the modified df_train_encoded to df_train
# Make sure to align them by the index if they share the same row order
train_df = pd.concat([train_df, df_train_encoded], axis=1)
test_df = pd.concat([test_df, df_test_encoded], axis=1)

# Now, df_train_final contains the original columns from df_train plus the one-hot encoded columns from df_train_encoded, excluding the "unknown" column.



## Additional feature engineering

In [14]:

def feature_engineering(data):
    # Assuming the data has already been preprocessed as per previous steps

    # Feature 2: Duration between the first action and the last purchase
    if 'DaysSinceCustomerFirstWBBActionDate' in data.columns and 'DaysSinceCustomerLastWBBPurchaseDate' in data.columns:
        data['DurationBetweenFirstActionAndLastPurchase'] = data['DaysSinceCustomerFirstWBBActionDate'] - data['DaysSinceCustomerLastWBBPurchaseDate']
    
    # Feature 3: Total number of actions taken by the customer (assuming we have such data)
    # This is a placeholder for demonstration; in practice, you would calculate based on available data
    if 'TotalActionsByCustomer' in data.columns:
        data['TotalActionsByCustomer'] = data['TotalActionsByCustomer']  # Assuming this column is calculated elsewhere

    # Feature 4: Whether the event is in the customer's state
    if 'CustomerState' in data.columns and 'FacilityState' in data.columns:
        data['IsLocalEvent'] = (data['CustomerState'] == data['FacilityState']).astype('int64')
    
    return data

# Apply feature engineering to the preprocessed data
engineered_data = feature_engineering(train_df)
train_df = engineered_data
engineered_data_test = feature_engineering(test_df)
test_df =engineered_data_test 

# Show the first few rows of the data with new features
engineered_data.head()


Unnamed: 0,RecordID,ChampionshipYear,CustomerID,CustomerCity,CustomerState,CustomerZipCode,CustomerInstitutionAffinity,IsCustomerInNCAAMembership,HasCustomerClickedOrOpenedEmailsSixMonthsPrior,CustomerFirstWBBActionDate,...,IUPUI,Marquette,Iona,Gardner-Webb,American,Buffalo,Mercer,Holy cross,Vermont,IsLocalEvent
0,578923,2022,15,Unknown,Unknown,Unknown,Unknown,No,Unknown,3/26/2018,...,0,0,0,0,0,0,0,0,0,1
1,1442480,2023,15,Unknown,Unknown,Unknown,Unknown,No,Unknown,3/26/2018,...,0,0,0,0,0,0,0,0,0,1
2,28140,2022,24,Pasadena,CA,91104,Unknown,No,Unknown,10/28/2013,...,0,0,0,0,0,0,0,0,0,0
3,851863,2023,24,Pasadena,CA,91104,Unknown,No,Unknown,10/28/2013,...,0,0,0,0,0,0,0,0,0,0
4,247590,2022,47,Unknown,Unknown,Unknown,Unknown,No,Unknown,10/28/2013,...,0,0,0,0,0,0,0,0,0,1


## Imputing medians.

In [15]:
import pandas as pd
import numpy as np

# Assuming engineered_data is your DataFrame
engineered_data = train_df
engineered_data_test = test_df
# List of columns to convert to int64
columns_to_convert = ['Distance', 'Age', 'Begin_day']

for column in columns_to_convert:
    # Replace 'Unknown' with np.nan for processing
    engineered_data[column] = engineered_data[column].replace('Unknown', np.nan)
    # Convert column to float to handle both integers and floating points
    engineered_data[column] = engineered_data[column].astype(float)
    # Calculate the median of the column, ignoring NaN values
    median_value = engineered_data[column].median()
    # Replace NaN values with the median value
    engineered_data[column] = engineered_data[column].fillna(median_value)
    # Convert the column to int64 by first rounding and then converting to int
    engineered_data[column] = engineered_data[column].round().astype('int64')

# Check the data types to confirm the conversion
print(engineered_data.dtypes)
train_df = engineered_data

RecordID             int64
ChampionshipYear     int64
CustomerID           int64
CustomerCity        object
CustomerState       object
                     ...  
Buffalo              int64
Mercer               int64
Holy cross           int64
Vermont              int64
IsLocalEvent         int64
Length: 147, dtype: object


## Imputing it for the test set

In [16]:

# Assuming engineered_data is your DataFrame

# List of columns to convert to int64
columns_to_convert = ['Distance', 'Age', 'Begin_day']

for column in columns_to_convert:
    # Replace 'Unknown' with np.nan for processing
    engineered_data_test[column] = engineered_data_test[column].replace('Unknown', np.nan)
    # Convert column to float to handle both integers and floating points
    engineered_data_test[column] = engineered_data_test[column].astype(float)
    # Calculate the median of the column, ignoring NaN values
    median_value = engineered_data_test[column].median()
    # Replace NaN values with the median value
    engineered_data_test[column] = engineered_data_test[column].fillna(median_value)
    # Convert the column to int64 by first rounding and then converting to int
    engineered_data_test[column] = engineered_data_test[column].round().astype('int64')

# Check the data types to confirm the conversion
print(engineered_data_test.dtypes)
test_df = engineered_data_test

RecordID             int64
ChampionshipYear     int64
CustomerID           int64
CustomerCity        object
CustomerState       object
                     ...  
IUPUI                int64
Marquette            int64
Gardner-Webb         int64
James madison        int64
IsLocalEvent         int64
Length: 147, dtype: object


## Checking for datatype inconsistency

In [17]:
import pandas as pd

# Sample DataFrame creation for demonstration; replace it with your actual DataFrame
# train_df = pd.read_csv('your_file.csv')

def find_mixed_type_columns(df):
    mixed_type_columns = {}
    for column in df.columns:
        # Using a set to store unique types found in each column
        unique_types = {type(value).__name__ for value in df[column] if pd.notnull(value)}
        if len(unique_types) > 1:
            mixed_type_columns[column] = unique_types
    return mixed_type_columns

mixed_type_columns = find_mixed_type_columns(train_df)
for column, types in mixed_type_columns.items():
    print(f"Column '{column}' contains these types: {types}")


In [18]:
# No inconsistency found!

## Now preparing to impute medians for other numerical data

In [19]:

train_df.replace('Unknown', np.nan, inplace=True)
test_df.replace('Unknown', np.nan, inplace=True)


In [20]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load datasets
#train_df = pd.read_csv('DIWBB_Training.csv')
#test_df = pd.read_csv('DIWBB_Test.csv')

train_df.columns = train_df.columns.astype(str)
test_df.columns = test_df.columns.astype(str)

# Identify columns by type
categorical_cols = train_df.select_dtypes(include=['object', 'bool','string']).columns.tolist()
numerical_cols = train_df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Remove target variable and identifiers from feature list
categorical_cols.remove('ActivityType')
numerical_cols.remove('RecordID')

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Preprocess the datasets
X_train = preprocessor.fit_transform(train_df.drop(['RecordID', 'ActivityType'], axis=1))
X_test = preprocessor.transform(test_df.drop('RecordID', axis=1))

# Target variable
y_train = train_df['ActivityType']


## We can see the dimensions of the sparse matrix below

In [21]:
X_train

<209266x33354 sparse matrix of type '<class 'numpy.float64'>'
	with 30343570 stored elements in Compressed Sparse Row format>

In [22]:
X_test

<20935x33354 sparse matrix of type '<class 'numpy.float64'>'
	with 3026526 stored elements in Compressed Sparse Row format>

## Code to see sparse matrix in detail

In [23]:
import pandas as pd
from scipy.sparse import csr_matrix

# Assuming X_train is your sparse matrix
# Convert the sparse matrix to a dense matrix first
dense_matrix = X_train.toarray()

# Generate generic column names
columns = [f'feature_{i}' for i in range(X_train.shape[1])]

# Create a DataFrame
df_xpanded = pd.DataFrame(data=dense_matrix, columns=columns)

# Now df is a full DataFrame of your sparse matrix


In [24]:
df_xpanded

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_33344,feature_33345,feature_33346,feature_33347,feature_33348,feature_33349,feature_33350,feature_33351,feature_33352,feature_33353
0,-1.059492,-1.725058,0.114379,-0.889407,-0.10453,-0.257176,-0.330742,-0.444656,-0.052215,-0.155377,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.943848,-1.725058,0.114379,-0.611291,-0.10453,-0.257176,-0.330742,-0.444656,-0.052215,-0.155377,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,-1.059492,-1.725023,0.114379,0.501175,-0.10453,-0.257176,-0.330742,-0.444656,-0.052215,-0.155377,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.943848,-1.725023,0.114379,0.779291,-0.10453,-0.257176,-0.330742,-0.444656,-0.052215,-0.155377,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,-1.059492,-1.724934,0.114379,0.501175,-0.10453,-0.257176,-0.330742,-0.444656,-0.052215,-0.155377,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209261,0.943848,1.729360,0.114379,-2.001873,-0.10453,-0.257176,0.770395,2.146582,-0.052215,-0.155377,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
209262,-1.059492,1.729375,0.114379,0.501175,-0.10453,-0.257176,-0.330742,-0.444656,-0.052215,-0.155377,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
209263,0.943848,1.729375,0.114379,0.779291,-0.10453,-0.257176,-0.330742,-0.444656,-0.052215,-0.155377,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
209264,-1.059492,1.729399,0.114379,0.501175,-0.10453,-0.257176,-0.330742,-0.444656,-0.052215,-0.155377,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [25]:
!pip install xgboost



## Importing the libraries. (Skipped splitting as we are training on all the data)

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Building our ensemble model


In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score
import joblib  # Import joblib for saving the model

# Assuming X_train, y_train are your full training dataset

# Encode the target variable if necessary
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)


# Initialize models
logistic_model = LogisticRegression(max_iter=10000, random_state=0, C=0.5)
linear_svc = LinearSVC(random_state=96, C=1.0, tol=1e-3, max_iter=15000, dual=False)
xgb_model = XGBClassifier(n_estimators=3000, learning_rate=0.1, random_state=69)

# Calling the callibrated_svc so that model can fit into ensemble.
calibrated_svc = CalibratedClassifierCV(linear_svc, method='sigmoid', cv=5)

# Ensemble model with VotingClassifier
ensemble_model = VotingClassifier(
    estimators=[
        ('logistic', logistic_model),
        ('svc', calibrated_svc),
        ('xgb', xgb_model)
    ],
    voting='soft'
)

# Train the ensemble model on the full training dataset
ensemble_model.fit(X_train, y_train_encoded)
 
 


## Now if we want to see the cross validation of the above ensemble model.

In [18]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score
import joblib  # Import joblib for saving the model

from sklearn.model_selection import cross_val_score

label_encoder = LabelEncoder()
# Assuming X and y are your full dataset and target variable respectively
# Ensure y is encoded if it's categorical
y_encoded = label_encoder.fit_transform(y_train)

# Initialize the models again as needed
logistic_model = LogisticRegression(max_iter=10000, random_state=0, C=0.5)
linear_svc = LinearSVC(random_state=96, C=1.0, tol=1e-3, max_iter=15000, dual=False)
calibrated_svc = CalibratedClassifierCV(estimator=linear_svc, method='sigmoid', cv=5)
xgb_model = XGBClassifier(n_estimators=3000, learning_rate=0.1, random_state=69)

# Initialize the ensemble model with the same configurations
ensemble_model = VotingClassifier(
    estimators=[
        ('logistic', logistic_model),
        ('svc', calibrated_svc),
        ('xgb', xgb_model)
    ],
    voting='soft'
)

# Perform 5-fold cross-validation
# Adjust cv parameter as needed for your application
cv_scores = cross_val_score(ensemble_model, X_train, y_encoded, cv=5, scoring='accuracy')

# Print the cross-validation scores
print("CV Scores:", cv_scores)

# Print the average of the cross-validation scores
print("Average CV Score:", cv_scores.mean())


CV Scores: [0.99084914 0.99137457 0.99108786 0.99139847 0.99130289]

Average CV Score: 0.9912025859597055


# Now seeing the feature importances

In [48]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score
import joblib  # Import joblib for saving the model



label_encoder = LabelEncoder()
# Assuming X and y are your full dataset and target variable respectively
# Ensure y is encoded if it's categorical
y_encoded = label_encoder.fit_transform(y_train)

# Directly fit an XGBoost model
xgb_model = XGBClassifier(n_estimators=3000, learning_rate=0.1, random_state=69)
xgb_model.fit(X_train, y_encoded)


# Get feature importances
feature_importances = xgb_model.feature_importances_

# Assuming you have a way to accurately retrieve or define feature names
# This step might require custom handling based on how your pipeline is set up
feature_names = np.array([f"Feature {i}" for i in range(X_train.shape[1])])  # Placeholder feature names

# Combine feature names with their importances and sort them
features_and_importances = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

# Display the top 10 feature importances
print("Top 10 Feature Importances from XGBoost:")
for feature, importance in features_and_importances[:10]:
    print(f"{feature}: {importance}")


Top 10 Feature Importances from XGBoost:

Feature 24271: 0.40902844071388245

Feature 24272: 0.2712523937225342

Feature 24387: 0.07418902963399887

Feature 24275: 0.030830474570393562

Feature 24267: 0.008823391050100327

Feature 24270: 0.004186241887509823

Feature 24543: 0.0037762883584946394

Feature 0: 0.003322502365335822

Feature 24269: 0.002695696661248803

Feature 24471: 0.0025133206509053707


In [50]:
# Assuming preprocessor is your ColumnTransformer

# Use get_feature_names_out to retrieve the transformed feature names
# For versions of scikit-learn prior to 1.0, you might need to use get_feature_names() method with adjustments
feature_names = preprocessor.get_feature_names_out()

# N

# Get feature importances
feature_importances = xgb_model.feature_importances_

# Combine feature names with their importances and sort them
features_and_importances = sorted(zip(feature_names, feature_importances), key=lambda x: x[1], reverse=True)

# Display the top feature importances
print("Top Feature Importances from XGBoost:")
for feature, importance in features_and_importances[:10]:
    print(f"{feature}: {importance}")


Top Feature Importances from XGBoost:

cat__EventRoundName_missing: 0.40902844071388245

cat__IsEventFinalSite_No: 0.2712523937225342

cat__FacilityDescription_Professional Sports Arena: 0.07418902963399887

cat__EventSession_All-Session: 0.030830474570393562

cat__EventRoundName_Finals: 0.008823391050100327

cat__EventRoundName_Regionals: 0.004186241887509823

cat__Check_No: 0.0037762883584946394

num__ChampionshipYear: 0.003322502365335822

cat__EventRoundName_First and Second Rounds: 0.002695696661248803

cat__FacilityZipCode_missing: 0.0025133206509053707


In [None]:
from sklearn.model_selection import cross_val_score

# Assuming X and y_encoded are your features and encoded target variable respectively

# Initialize your models
logistic_model = LogisticRegression(max_iter=10000, random_state=0, C=0.5)
linear_svc = LinearSVC(random_state=96, C=1.0, tol=1e-3, max_iter=15000, dual=False)
calibrated_svc = CalibratedClassifierCV(estimator=linear_svc, method='sigmoid', cv=5)
xgb_model = XGBClassifier(n_estimators=3000, learning_rate=0.1, random_state=69)

# Calculate cross-validated accuracies
cv_score_logistic = cross_val_score(logistic_model, X_train, y_encoded, cv=5, scoring='accuracy').mean()
cv_score_svc = cross_val_score(calibrated_svc, X_train, y_encoded, cv=5, scoring='accuracy').mean()
cv_score_xgb = cross_val_score(xgb_model, X_train, y_encoded, cv=5, scoring='accuracy').mean()

print("Cross-validated Accuracy Scores:")
print(f"Logistic Regression: {cv_score_logistic:.4f}")
print(f"Calibrated Linear SVC: {cv_score_svc:.4f}")
print(f"XGBoost Classifier: {cv_score_xgb:.4f}")


## Test data. Remember test data has been transformed before

In [26]:

df = X_test

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Initialize the models


# Make predictions
ensemble_predictions = ensemble_model.predict(df)



## Counting predictions

In [27]:
from collections import Counter


# Count the occurrences of each unique value in the array
value_counts = Counter(ensemble_predictions)

# Display the counts
print(value_counts)

Counter({1: 19118, 3: 1356, 5: 380, 4: 57, 0: 20, 2: 4})


In [28]:
ensemble_predictions_labels = label_encoder.inverse_transform(ensemble_predictions)

# Now, count the occurrences of each unique value in the array of labels
value_counts_labels = Counter(ensemble_predictions_labels)

# Display the counts with categories
print(value_counts_labels)

Counter({'No Activity': 19118, 'Primary Purchase': 1356, 'Transfer Recipient': 380, 'Secondary Purchase': 57, 'Multiple Activities': 20, 'Other Secondary Activity': 4})


In [29]:
ensemble_predictions_labels = label_encoder.inverse_transform(ensemble_predictions)

# Now, count the occurrences of each unique value in the array of labels
value_counts_labels = Counter(ensemble_predictions_labels)

# Display the counts with categories
print(value_counts_labels)

Counter({'No Activity': 19118, 'Primary Purchase': 1356, 'Transfer Recipient': 380, 'Secondary Purchase': 57, 'Multiple Activities': 20, 'Other Secondary Activity': 4})


# Exporting

In [30]:
import pandas as pd

# Assuming logistic_predictions is your array

# Convert the array to a pandas DataFrame
df = pd.DataFrame(ensemble_predictions_labels, columns=['ActivityType'])

# Export the DataFrame to a CSV file
df.to_csv('lastdata_28th_ensemble_droppedmulticollinearcolumns.csv', index=False)