In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Read the training set; the relative path is resolved against the
# notebook's working directory.
data_path = "drugs_train.csv"
data = pd.read_csv(filepath_or_buffer=data_path)

In [None]:
# Quick look at the raw data: the first rows, then the per-column summary.
print("Dataset Overview:")
print(data.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
data.info()

In [None]:
# Step 2: Exploratory Data Analysis (EDA)
# Count the missing entries in every column.
missing_per_column = data.isnull().sum()
print("\nMissing Values:\n", missing_per_column)

# Summary statistics as produced by DataFrame.describe().
print("\nBasic Statistics:")
summary_stats = data.describe()
print(summary_stats)

In [None]:
# Visualize the distribution of the target variable
target_column = "Drug"  # Replace with the actual target column name
sns.countplot(x=target_column, data=data)
plt.title("Target Variable Distribution")
plt.show()

# Visualize correlations between the numerical features.
# The correlation is computed on the numeric columns only: at this point the
# frame has not been encoded yet, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 (numeric_only defaults to False there).
numeric_data = data.select_dtypes(include='number')
if len(numeric_data.columns) > 1:
    plt.figure(figsize=(10, 6))
    sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm')
    plt.title("Correlation Matrix")
    plt.show()

In [None]:
# Step 3: Preprocessing
# Check categorical features
categorical_features = data.select_dtypes(include='object').columns
print("\nCategorical Features:", categorical_features)

# Encode every categorical feature column with its own LabelEncoder
# (LabelEncoder is imported in the import cell at the top of the notebook).
# The target column is skipped here; it is encoded separately.
# Note: the encoded values overwrite the original columns of `data`, and
# `label_encoders` is rebuilt from scratch each time this cell runs, so
# re-run from the data-loading cell rather than re-running this cell alone.
feature_columns = [col for col in categorical_features if col != target_column]

label_encoders = {}
for col in feature_columns:
    encoder = LabelEncoder()
    data[col] = encoder.fit_transform(data[col])
    # Keep the fitted encoder so the mapping for this column stays available.
    label_encoders[col] = encoder

In [None]:
# Encode the target labels; the fitted encoder is kept in `target_encoder`.
target_encoder = LabelEncoder()
encoded_target = target_encoder.fit_transform(data[target_column])
data[target_column] = encoded_target

# Step 4: separate the target from the features, then hold out 20% of the
# rows as a test set (fixed random_state for a repeatable split).
y = data[target_column]
X = data.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 5: fit the classifier on the training split.
# A random forest with 100 trees serves as the example model; fit() returns
# the estimator itself, so construction and training are chained.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Step 6: predict labels for the held-out test split.
y_pred = clf.predict(X_test)

In [None]:
# Report test-set quality: confusion matrix, per-class report, overall accuracy.
test_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", test_confusion)

test_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", test_report)

test_accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy Score:", test_accuracy)

In [None]:
# Step 7: Visualize Feature Importance
# Pair each feature column with the importance stored on the fitted forest,
# ordered from most to least important.
feature_names = X.columns
feature_importances = clf.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the sorted importances.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature', ax=ax)
ax.set_title("Feature Importance")
plt.show()