In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np

In [6]:

# Load your dataset
# df = pd.read_csv('your_dataset.csv')

# Sample data
data = {
    'feature1': [1, 2, np.nan, 4, 5, 6, 7, 8, 9, 10],
    'feature2': [1.5, 2.5, 3.5, np.nan, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5],
    'feature3': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    'target': [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)

## Task 1: Data Preprocessing

In [7]:
## Handle missing values
df.fillna(df.median(), inplace=True)  # Filling missing values with median

## Handle outliers (example using IQR method)
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

## Normalize or scale features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop('target', axis=1))
df_scaled = pd.DataFrame(scaled_features, columns=df.columns[:-1])
df_scaled['target'] = df['target'].values

## Split the data into training and testing sets
X = df_scaled.drop('target', axis=1)
y = df_scaled['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



## Task 2: Feature Engineering

In [8]:
## Creating additional features
X_train['feature1_feature2_ratio'] = X_train['feature1'] / (X_train['feature2'] + 1e-9)
X_test['feature1_feature2_ratio'] = X_test['feature1'] / (X_test['feature2'] + 1e-9)

## Consider time-based features, rolling statistics, and other transformations
# For this example, we assume a 'date' column for time-based features
# df['date'] = pd.date_range(start='1/1/2020', periods=len(df), freq='D')
# df['month'] = df['date'].dt.month
# df['rolling_mean'] = df['feature1'].rolling(window=3).mean()

# Example adding rolling statistics
X_train['feature1_rolling_mean'] = X_train['feature1'].rolling(window=3).mean().fillna(X_train['feature1'].mean())
X_test['feature1_rolling_mean'] = X_test['feature1'].rolling(window=3).mean().fillna(X_test['feature1'].mean())

print("Training Features:\n", X_train.head())
print("Testing Features:\n", X_test.head())

Training Features:
    feature1  feature2  feature3  feature1_feature2_ratio  \
5  0.072739  0.106000  0.174078                 0.686221   
0 -1.745743 -1.660663 -1.566699                 1.051232   
7  0.800132  0.812665  0.870388                 0.984578   
2  0.072739 -0.953998 -0.870388                -0.076247   
9  1.527525  1.519330  1.566699                 1.005394   

   feature1_rolling_mean  
5               0.027277  
0               0.027277  
7              -0.290957  
2              -0.290957  
9               0.800132  
Testing Features:
    feature1  feature2  feature3  feature1_feature2_ratio  \
8  1.163829  1.165998  1.218544                 0.998140   
1 -1.382047 -1.307331 -1.218544                 1.057152   

   feature1_rolling_mean  
8              -0.109109  
1              -0.109109  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_inde