### Objective: Figure out the right people to target for the social media ad campaigns 

In [20]:
# Loading Libraries and Dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
from skimpy import skim




#Import Data
df1 = pd.read_csv("Data/test.csv", delimiter=";")
df2 = pd.read_csv("Data/train.csv", delimiter=";")
df = pd.concat([df1, df2], axis=0)

### PRE DATA TRANSFORMATION 
# 1. Converting columns with two variables into binary 
df.default = df.default.map({'yes': 1, 'no': 0})
df.housing = df.housing.map({'yes': 1, 'no': 0})
df.loan = df.loan.map({'yes': 1, 'no': 0})
df.y = df.y.map({'yes': 1, 'no': 0})

# 2. Make 'contact' variable binary be removing 'other' and rename to cellular
df.contact = df.contact.map({'cellular': 1, 'telephone': 0})
df.columns = df.columns.str.replace('contact', 'cellular')
df.contact = df.cellular.astype(int, errors='ignore')

## 3. Some values in pday have -1. Replace with NaN
df.pdays.replace(-1, np.nan, inplace=True)

# 4. Rename columns to be more understandable 
df.rename(columns={
    'age': 'client_age',
    'job': 'job_type',
    'marital': 'marital_status',
    'education': 'education_level',
    'default': 'credit_default',
    'balance': 'avg_yearly_balance_eur',
    'housing': 'housing_loan',
    'loan': 'personal_loan',
    'cellular': 'cellular_contact',
    'day': 'last_contact_day',
    'month': 'last_contact_month',
    'duration': 'last_contact_duration',
    'campaign': 'num_contacts_campaign',
    'pdays': 'days_since_last_contact',
    'previous': 'num_prev_contacts',
    'poutcome': 'prev_campaign_outcome',
    'y': 'subscribed_term_deposit'
}, inplace=True)

# 5. Define mapping of job_type categories to more condensed ones
employment_mapping = {
    'management': 'White-Collar',
    'technician': 'White-Collar',
    'admin.': 'White-Collar',
    'self-employed': 'White-Collar',
    'entrepreneur': 'White-Collar',
    'blue-collar': 'Blue-Collar',
    'services': 'Blue-Collar',
    'housemaid': 'Blue-Collar',
    'retired': 'Retired/Unemployed',
    'unemployed': 'Retired/Unemployed',
    'student': 'Student'
}
df['job_type'] = df['job_type'].map(employment_mapping)

# 6. Condense month column to quarters to be more condensed
month_to_quarter = {
    'jan': 'Q1', 'feb': 'Q1', 'mar': 'Q1',
    'apr': 'Q2', 'may': 'Q2', 'jun': 'Q2',
    'jul': 'Q3', 'aug': 'Q3', 'sep': 'Q3',
    'oct': 'Q4', 'nov': 'Q4', 'dec': 'Q4'
}
df['last_contact_month'] = df['last_contact_month'].map(month_to_quarter)

# 7. Drop columns that are not needed due to high NAs
df = df.drop('prev_campaign_outcome', axis=1)
df = df.drop('days_since_last_contact', axis=1)
df.replace("unknown", np.nan, inplace=True)
df.dropna(inplace=True)

# 8. Convert columns to correct data types
df['client_age'] = df['client_age'].astype(int)
df['credit_default'] = df['credit_default'].astype(bool)
df['housing_loan'] = df['housing_loan'].astype(bool)
df['personal_loan'] = df['personal_loan'].astype(bool)
df['cellular_contact'] = df['cellular_contact'].astype(bool)
df['subscribed_term_deposit'] = df['subscribed_term_deposit'].astype(bool)

df['job_type'] = df['job_type'].astype('category')
df['marital_status'] = df['marital_status'].astype('category')
df['education_level'] = df['education_level'].astype('category')
df['last_contact_month'] = df['last_contact_month'].astype('category')

skim(df)

  df.contact = df.cellular.astype(int, errors='ignore')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.pdays.replace(-1, np.nan, inplace=True)


### Training Linear Model on Dataset

In [None]:
print('Before')
print(df.columns)

# Make dummy columns
list_category = 
df = pd.get_dummies(df, columns=['device_type'], drop_first=True)


print('After')
print(df.columns)

Index(['client_age', 'job_type', 'marital_status', 'education_level',
       'credit_default', 'avg_yearly_balance_eur', 'housing_loan',
       'personal_loan', 'cellular_contact', 'last_contact_day',
       'last_contact_month', 'last_contact_duration', 'num_contacts_campaign',
       'num_prev_contacts', 'subscribed_term_deposit'],
      dtype='object')


In [24]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

# 1. Load your data
# df = pd.read_csv('your_data.csv')  # example
# Assume target is 'converted' and features are numerical + properly encoded categorical

# 2. Define features and target
X = df[['client_age', 'job_type', 'marital_status', 'education_level',
       'credit_default', 'avg_yearly_balance_eur', 'housing_loan',
       'personal_loan', 'cellular_contact', 'last_contact_day',
       'last_contact_month', 'last_contact_duration', 'num_contacts_campaign',
       'num_prev_contacts']]  # replace with actual features
y = df['subscribed_term_deposit'].astype(int)  # ensure binary (0/1)

# 3. Add constant for intercept
X = sm.add_constant(X)

# 4. Fit logistic regression
model = sm.Logit(y, X)
result = model.fit()

# 5. Summary of results
print(result.summary())

# 6. Optional: Odds ratios
print("\nOdds Ratios:")
print(np.exp(result.params))

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).