In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [3]:
# Load the labelled training split (it includes the `subscribed` target column)
# directly from the hosted CSV; running this cell requires network access.
train_data_url = "https://raw.githubusercontent.com/dsrscientist/dataset5/main/termdeposit_train.csv"
train_data = pd.read_csv(train_data_url)

In [4]:
# Preview the training frame as loaded (pandas elides the middle rows in the display).
train_data

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribed
0,26110,56,admin.,married,unknown,no,1933,no,no,telephone,19,nov,44,2,-1,0,unknown,no
1,40576,31,unknown,married,secondary,no,3,no,no,cellular,20,jul,91,2,-1,0,unknown,no
2,15320,27,services,married,secondary,no,891,yes,no,cellular,18,jul,240,1,-1,0,unknown,no
3,43962,57,management,divorced,tertiary,no,3287,no,no,cellular,22,jun,867,1,84,3,success,yes
4,29842,31,technician,married,secondary,no,119,yes,no,cellular,4,feb,380,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31642,36483,29,management,single,tertiary,no,0,yes,no,cellular,12,may,116,2,-1,0,unknown,no
31643,40178,53,management,divorced,tertiary,no,380,no,yes,cellular,5,jun,438,2,-1,0,unknown,yes
31644,19710,32,management,single,tertiary,no,312,no,no,cellular,7,aug,37,3,-1,0,unknown,no
31645,38556,57,technician,married,secondary,no,225,yes,no,telephone,15,may,22,7,337,12,failure,no


In [5]:
# Load the unlabelled test split (same columns as the training split, minus
# `subscribed`) from the hosted CSV; running this cell requires network access.
test_data_url = "https://raw.githubusercontent.com/dsrscientist/dataset5/main/termdeposit_test.csv"
test_data = pd.read_csv(test_data_url)

In [6]:
# Preview the test frame as loaded (pandas elides the middle rows in the display).
test_data

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,38441,32,services,married,secondary,no,118,yes,no,cellular,15,may,20,6,-1,0,unknown
1,40403,78,retired,divorced,primary,no,2787,no,no,telephone,1,jul,372,1,-1,0,unknown
2,3709,31,self-employed,single,tertiary,no,144,yes,no,unknown,16,may,676,1,-1,0,unknown
3,37422,57,services,single,primary,no,3777,yes,no,telephone,13,may,65,2,-1,0,unknown
4,12527,45,blue-collar,divorced,secondary,no,-705,no,yes,unknown,3,jul,111,1,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13559,23465,39,management,married,tertiary,no,45,no,no,cellular,28,aug,148,4,-1,0,unknown
13560,11743,54,blue-collar,married,primary,no,2281,yes,no,unknown,20,jun,158,1,-1,0,unknown
13561,28292,35,retired,married,primary,no,285,yes,no,cellular,29,jan,136,1,-1,0,unknown
13562,45163,29,admin.,single,secondary,no,464,no,no,cellular,9,nov,208,2,91,3,success


In [10]:
# Convert the categorical feature columns to integer codes using Label Encoding.
#
# Each column gets its own LabelEncoder, fitted on the training split only and
# then applied to the test split, so both splits share one mapping per column.
# The fitted encoders are kept in `label_encoders` (keyed by column name) so the
# integer codes can later be mapped back with `inverse_transform`. A single
# shared encoder would be refitted on every iteration and would only remember
# the classes of the last column.
#
# NOTE: `transform` raises ValueError if the test split contains a category
# that never appeared in the training split.
# NOTE: the columns of `train_data` / `test_data` are overwritten in place, so
# re-running this cell without reloading the data refits the encoders on the
# already-encoded integers and loses the original string labels.
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    train_data[col] = label_encoder.fit_transform(train_data[col])
    test_data[col] = label_encoder.transform(test_data[col])
    label_encoders[col] = label_encoder

In [19]:
# Inspect the training frame after encoding: the categorical columns now hold integer codes.
train_data

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribed
0,26110,56,0,1,3,0,1933,0,0,1,19,9,44,2,-1,0,3,no
1,40576,31,11,1,1,0,3,0,0,0,20,5,91,2,-1,0,3,no
2,15320,27,7,1,1,0,891,1,0,0,18,5,240,1,-1,0,3,no
3,43962,57,4,0,2,0,3287,0,0,0,22,6,867,1,84,3,2,yes
4,29842,31,9,1,1,0,119,1,0,0,4,3,380,1,-1,0,3,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31642,36483,29,4,2,2,0,0,1,0,0,12,8,116,2,-1,0,3,no
31643,40178,53,4,0,2,0,380,0,1,0,5,6,438,2,-1,0,3,yes
31644,19710,32,4,2,2,0,312,0,0,0,7,1,37,3,-1,0,3,no
31645,38556,57,9,1,1,0,225,1,0,1,15,8,22,7,337,12,0,no


In [20]:
# Inspect the test frame after encoding: the categorical columns now hold integer codes.
test_data

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,38441,32,7,1,1,0,118,1,0,0,15,8,20,6,-1,0,3
1,40403,78,5,0,0,0,2787,0,0,1,1,5,372,1,-1,0,3
2,3709,31,6,2,2,0,144,1,0,2,16,8,676,1,-1,0,3
3,37422,57,7,2,0,0,3777,1,0,1,13,8,65,2,-1,0,3
4,12527,45,1,0,1,0,-705,0,1,2,3,5,111,1,-1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13559,23465,39,4,1,2,0,45,0,0,0,28,1,148,4,-1,0,3
13560,11743,54,1,1,0,0,2281,1,0,2,20,6,158,1,-1,0,3
13561,28292,35,5,1,0,0,285,1,0,0,29,4,136,1,-1,0,3
13562,45163,29,0,2,1,0,464,0,0,0,9,9,208,2,91,3,2


In [21]:
# Summarise the training frame: per-column non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31647 entries, 0 to 31646
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          31647 non-null  int64 
 1   age         31647 non-null  int64 
 2   job         31647 non-null  int64 
 3   marital     31647 non-null  int64 
 4   education   31647 non-null  int64 
 5   default     31647 non-null  int64 
 6   balance     31647 non-null  int64 
 7   housing     31647 non-null  int64 
 8   loan        31647 non-null  int64 
 9   contact     31647 non-null  int64 
 10  day         31647 non-null  int64 
 11  month       31647 non-null  int64 
 12  duration    31647 non-null  int64 
 13  campaign    31647 non-null  int64 
 14  pdays       31647 non-null  int64 
 15  previous    31647 non-null  int64 
 16  poutcome    31647 non-null  int32 
 17  subscribed  31647 non-null  object
dtypes: int32(1), int64(16), object(1)
memory usage: 4.2+ MB


In [24]:
# Summarise the test frame: per-column non-null counts and dtypes.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13564 entries, 0 to 13563
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ID         13564 non-null  int64
 1   age        13564 non-null  int64
 2   job        13564 non-null  int64
 3   marital    13564 non-null  int64
 4   education  13564 non-null  int64
 5   default    13564 non-null  int64
 6   balance    13564 non-null  int64
 7   housing    13564 non-null  int64
 8   loan       13564 non-null  int64
 9   contact    13564 non-null  int64
 10  day        13564 non-null  int64
 11  month      13564 non-null  int64
 12  duration   13564 non-null  int64
 13  campaign   13564 non-null  int64
 14  pdays      13564 non-null  int64
 15  previous   13564 non-null  int64
 16  poutcome   13564 non-null  int32
dtypes: int32(1), int64(16)
memory usage: 1.7 MB


In [11]:
# Separate the model inputs from the target.
# `subscribed` is the label and exists only in the training frame; `ID` is
# dropped from the features of both splits so the two matrices line up.
y_train = train_data['subscribed']
X_train = train_data.drop(columns=['ID', 'subscribed'])
X_test = test_data.drop(columns='ID')

In [12]:
# Initialize Random Forest model with the library's default hyperparameters;
# random_state is pinned so repeated fits on the same data build the same forest.
model = RandomForestClassifier(random_state=42)

In [13]:
# Train the model on the training features against the `subscribed` labels
model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [14]:
# Make predictions on the test set (one predicted label per row of X_test)
test_predictions = model.predict(X_test)

In [15]:
# Show the predicted labels; they are the same 'yes'/'no' strings as the training target.
test_predictions

array(['no', 'no', 'no', ..., 'no', 'yes', 'no'], dtype=object)

In [16]:
# Build the results table: keep the test IDs and attach the predicted label
# for each row as a `subscribed` column.
output = test_data[['ID']].assign(subscribed=test_predictions)

In [17]:
# Save predictions to a CSV file in the current working directory;
# index=False keeps the DataFrame's row index out of the file.
output.to_csv('term_deposit_predictions.csv', index=False)

In [18]:
# Preview the ID / predicted-label table that was written to disk.
output

Unnamed: 0,ID,subscribed
0,38441,no
1,40403,no
2,3709,no
3,37422,no
4,12527,no
...,...,...
13559,23465,no
13560,11743,no
13561,28292,no
13562,45163,yes
