In [3]:
# SECTION - 1
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 1. Load the original dataset
df = pd.read_csv('Downloads/student.csv')

# 2. Feature Engineering: Add 'Total' score
subject_cols = ['Math', 'Physics', 'Chemistry', 'Biology', 'English']
df['Total'] = df[subject_cols].sum(axis=1)

# 3. Save the enhanced dataset
df.to_csv('Downloads/student_with_total.csv', index=False)
print('Enhanced dataset saved as student_with_total.csv.')

# 4. Prepare features and target
# Predict 'Math' score (you can change to any subject you want as the target)
X = df[subject_cols + ['Total']].drop('Math', axis=1)
y = df['Math']

# 5. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Hyperparameter Tuning with GridSearchCV (Random Forest example)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [4, 6, 8, None],
    'min_samples_split': [2, 4, 8]
}
model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# 7. Best model performance and output
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Best Model Parameters: {grid_search.best_params_}')
print(f'Test MSE: {mse:.2f}')


Enhanced dataset saved as student_with_total.csv.
Best Model Parameters: {'max_depth': 6, 'min_samples_split': 2, 'n_estimators': 50}
Test MSE: 569.95


In [4]:
# SECTION - 2
# 1. Load & Preprocess the Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

# Load dataset
df = pd.read_csv('Downloads/fraud_detection.csv')

# Inspect missing values
print(df.isnull().sum())

# Label Encoding for 'Type'
le = LabelEncoder()
df['Type_encoded'] = le.fit_transform(df['Type'])

# 2. Feature Engineering
# Example: Create 'Large_Transaction' for amounts above median
df['Large_Transaction'] = (df['Amount'] > df['Amount'].median()).astype(int)
# Example: Transaction ID endswith digit (parity feature)
df['ID_last_digit'] = df['Transaction ID'].str[-1].astype(int) % 2

# 3. Train a Decision Tree Classifier
X = df[['Amount', 'Type_encoded', 'Large_Transaction', 'ID_last_digit']]
y = df['Is Fraud']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# 4. Evaluate Model Performance
y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nPrecision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

# 5. Recommendations for Improvement
print(\"\"\"
Recommendations:
- Try ensemble models like Random Forest for improved accuracy.
- Engineer more features (e.g., time-based, account profile, amount patterns).
- Apply anomaly detection for unsupervised insights.
- Tune hyperparameters for optimal tree depth and splits.
- Use SMOTE or resampling for imbalanced classes.
\"\"\")


SyntaxError: unexpected character after line continuation character (3309606854.py, line 45)