In [2]:
pip install pandas numpy matplotlib seaborn scikit-learn

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# File paths
train_data_file = r'C:\Users\Rajan Mishra Ji\Downloads\Train_Dataset.csv'
test_data_file = r'C:\Users\Rajan Mishra Ji\Downloads\Test_Dataset.csv'

# Load the dataset
try:
    train_data = pd.read_csv(train_data_file)
    test_data = pd.read_csv(test_data_file)
except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Ensure the dataset files are in the correct directory.")
    raise

# Verify the column names in training data
print("Columns in training data:\n", train_data.columns)

# Check for 'default' or 'target' columns
target_variable = None
if 'default' in train_data.columns:
    target_variable = 'default'
    print("Found 'default' column in training dataset. Proceeding with it as target variable.")
elif 'target' in train_data.columns:
    target_variable = 'target'
    print("Found 'target' column in training dataset. Proceeding with it as target variable.")
else:
    print("No suitable target variable ('default' or 'target') found in training dataset. Check column names or dataset structure.")
    raise KeyError("Target variable not found.")

# Assign target variable and features
y_train = train_data[target_variable]
X_train = train_data.drop(['ID', target_variable], axis=1)  # Assuming 'ID' and target_variable are dropped for training

# Explore the dataset
print("Train Data Head:\n", X_train.head())
print("\nTrain Data Info:\n", X_train.info())
print("\nTrain Data Description:\n", X_train.describe())

# Check for missing values
print("\nMissing Values in Train Data:\n", X_train.isnull().sum())
print("\nMissing Values in Test Data:\n", test_data.isnull().sum())

# Clean the dataset by removing non-numeric characters
def clean_numeric_columns(df):
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = pd.to_numeric(df[column], errors='coerce')
    return df

X_train = clean_numeric_columns(X_train)
test_data = clean_numeric_columns(test_data)

# Handle missing values (if any)
X_train = X_train.fillna(X_train.median())
test_data = test_data.fillna(test_data.median())

# Encode categorical variables
le = LabelEncoder()
for column in X_train.select_dtypes(include=['object']).columns:
    X_train[column] = le.fit_transform(X_train[column].astype(str))
for column in test_data.select_dtypes(include=['object']).columns:
    test_data[column] = le.transform(test_data[column].astype(str))

# Ensure all columns are now numeric
print("\nTrain Data Types:\n", X_train.dtypes)
print("\nTest Data Types:\n", test_data.dtypes)

# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
test_data_scaled = scaler.transform(test_data.drop('ID', axis=1))

# Train a RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Validate the model
y_val_pred = model.predict(X_val)
print("Validation Accuracy: ", accuracy_score(y_val, y_val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))

# Predict on the test data
test_predictions = model.predict(test_data_scaled)

# Prepare the submission
submission = pd.DataFrame({'ID': test_data['ID'], target_variable: test_predictions})
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")


  train_data = pd.read_csv(train_data_file)


Columns in training data:
 Index(['ID', 'Client_Income', 'Car_Owned', 'Bike_Owned', 'Active_Loan',
       'House_Own', 'Child_Count', 'Credit_Amount', 'Loan_Annuity',
       'Accompany_Client', 'Client_Income_Type', 'Client_Education',
       'Client_Marital_Status', 'Client_Gender', 'Loan_Contract_Type',
       'Client_Housing_Type', 'Population_Region_Relative', 'Age_Days',
       'Employed_Days', 'Registration_Days', 'ID_Days', 'Own_House_Age',
       'Mobile_Tag', 'Homephone_Tag', 'Workphone_Working', 'Client_Occupation',
       'Client_Family_Members', 'Cleint_City_Rating',
       'Application_Process_Day', 'Application_Process_Hour',
       'Client_Permanent_Match_Tag', 'Client_Contact_Work_Tag',
       'Type_Organization', 'Score_Source_1', 'Score_Source_2',
       'Score_Source_3', 'Social_Circle_Default', 'Phone_Change',
       'Credit_Bureau', 'Default'],
      dtype='object')
No suitable target variable ('default' or 'target') found in training dataset. Check column names or

  test_data = pd.read_csv(test_data_file)


KeyError: 'Target variable not found.'