<a href="https://colab.research.google.com/github/Owais981/ANN_SEMESTOR_PRO/blob/main/Task_1_Data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 **Task 1: Data Preprocessing for Machine Learning**

**Import Required Libraries**

In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

**Load Dataset**

In [None]:
# Path of the financial sample CSV under /content.
url = "/content/Financials (1).csv"

# Parse the CSV into the working DataFrame used by the following cells.
df = pd.read_csv(filepath_or_buffer=url)


**Display Initial Data Info**

In [None]:
# Summarise the loaded frame: dtypes / non-null counts first, then the number
# of missing values per column.
print("Initial Data Info:")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
df.info()
print("\nMissing Values:\n", df.isnull().sum())

Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Segment                700 non-null    object
 1   Country                700 non-null    object
 2    Product               700 non-null    object
 3    Discount Band         700 non-null    object
 4    Units Sold            700 non-null    object
 5    Manufacturing Price   700 non-null    object
 6    Sale Price            700 non-null    object
 7    Gross Sales           700 non-null    object
 8    Discounts             700 non-null    object
 9     Sales                700 non-null    object
 10   COGS                  700 non-null    object
 11   Profit                700 non-null    object
 12  Date                   700 non-null    object
 13  Month Number           700 non-null    int64 
 14   Month Name            700 non-null    object
 15  Year

**Handle Missing Values**

In [None]:
# Report the per-column count of missing entries in df.
# NOTE(review): no cleaning or conversion cell precedes this one in notebook
# order, so the label below describes a state this cell does not itself produce.
print(
    "Missing values after cleaning and conversion:",
    df.isna().sum(),
    sep="\n",
)

Missing values after cleaning and conversion:
Segment                0
Country                0
Product                0
Discount Band          0
Units Sold             0
Manufacturing Price    0
Sale Price             0
Gross Sales            0
Discounts              0
Sales                  0
COGS                   0
Profit                 0
Date                   0
Month Number           0
Month Name             0
Year                   0
dtype: int64


**Encode Categorical Variables**

In [None]:
# Pick the text (object-dtype) columns to one-hot encode, leaving out the
# calendar columns 'Date' and 'Month Name'.
# The raw CSV headers can carry surrounding whitespace (e.g. ' Month Name '),
# so names are compared after stripping; an exact-match test against
# 'Month Name' never fires on the padded header and the column gets encoded.
categorical_cols = [
    col
    for col in df.select_dtypes(include='object').columns
    if col.strip() not in ('Date', 'Month Name')
]

# NOTE(review): currency-like columns that are still stored as text at this
# point are object dtype too, so each of their distinct values becomes its own
# dummy column. Convert them to numbers first if that is not intended.
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("DataFrame after one-hot encoding:")
# info() prints its own report and returns None; print(info()) would add "None".
df_encoded.info()

DataFrame after one-hot encoding:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Columns: 3271 entries, Date to  Month Name _ September 
dtypes: bool(3268), int64(2), object(1)
memory usage: 2.2+ MB
None


**Define Preprocessing Pipeline**

In [None]:
# Build the preprocessor without touching the data. This cell sits before the
# train/test split in notebook order, so listing columns from X_train here
# raises NameError on a fresh kernel. Instead, columns are selected by dtype at
# fit time: numeric columns are standardised, object columns are one-hot encoded.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), make_column_selector(dtype_include=np.number)),
    ('cat', OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object))
], remainder='passthrough')  # columns matched by neither selector pass through unchanged

**Prepare Features and Target**

In [None]:
# The raw CSV headers carry stray surrounding spaces; normalise them first so
# the plain column names used below resolve.
df.columns = df.columns.str.strip()

# 'Profit' is the prediction target.
# NOTE(review): df.info() above reports this column as object dtype, and this
# cell does not convert it.
target_variable = 'Profit'

# Every column except the target and the calendar columns is a feature.
features = df.drop(
    columns=[target_variable, 'Date', 'Month Name', 'Year'],
).columns.tolist()

X = df[features]
y = df[target_variable]

print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)

Features (X) shape: (700, 12)
Target (y) shape: (700,)


**Split the Dataset**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shape of each of the four resulting pieces.
for _label, _part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(_label, _part.shape)

X_train shape: (560, 12)
X_test shape: (140, 12)
y_train shape: (560,)
y_test shape: (140,)


 **Apply Transformer**

In [None]:
# Feature columns stored as currency-formatted text that must become numeric.
currency_cols = ['Units Sold', 'Manufacturing Price', 'Sale Price', 'Gross Sales', 'Discounts', 'Sales', 'COGS']


def clean_currency(col):
    """Convert a Series of currency-formatted text to numbers.

    Steps, applied to every value:
      * drop '$' signs and thousands separators (','), then trim whitespace;
      * treat a value that is only a dash ('-') as zero;
      * rewrite parenthesised amounts such as '(123.45)' to '-123.45';
      * parse the result, turning anything unparseable into NaN.

    Parameters
    ----------
    col : pandas.Series
        Values to convert; each value is cast to ``str`` first.

    Returns
    -------
    pandas.Series
        Numeric Series with the same index as ``col``.
    """
    text = (
        col.astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .str.strip()
    )
    # Only a value that is *entirely* a dash is a zero placeholder. Matching the
    # whole trimmed value (not the substring " - ") also catches a bare "-" and
    # cannot rewrite the middle of an unrelated value.
    text = text.mask(text == '-', '0')
    # Parentheses mark negative amounts: (123.45) -> -123.45
    text = text.str.replace(r'\((\d+\.?\d*)\)', r'-\1', regex=True)
    # errors='coerce' turns values that still fail to parse into NaN.
    return pd.to_numeric(text, errors='coerce')

# Work on explicit copies: X_train / X_test are row selections produced by
# train_test_split, and assigning columns into such selections can trigger
# pandas' SettingWithCopyWarning. Copying also leaves X itself untouched.
X_train = X_train.copy()
X_test = X_test.copy()

# Convert the currency-formatted text columns to numbers in both splits.
for col in currency_cols:
    X_train[col] = clean_currency(X_train[col])
    X_test[col] = clean_currency(X_test[col])

# The target is loaded as text as well (df.info() reports 'Profit' as object),
# so give it the same conversion; otherwise y_train / y_test stay strings.
y_train = clean_currency(y_train)
y_test = clean_currency(y_test)

# Recompute the column groups now that the currency columns are numeric.
numeric_cols_train = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols_train = X_train.select_dtypes(include='object').columns.tolist()

# Standardise numeric columns and one-hot encode the remaining text columns;
# categories unseen during fit are ignored at transform time.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_cols_train),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols_train)
], remainder='passthrough')

# Fit on the training split only, then apply the fitted transform to the test
# split so no test-set statistics leak into the scaler or encoder.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
X_train_transformed = pipeline.fit_transform(X_train)
X_test_transformed = pipeline.transform(X_test)

print("X_train_transformed shape:", X_train_transformed.shape)
print("X_test_transformed shape:", X_test_transformed.shape)

X_train_transformed shape: (560, 28)
X_test_transformed shape: (140, 28)


**Print Final Shape**

In [None]:
# Report the shapes of the transformed training and testing feature matrices.
for _caption, _matrix in (
    ("Training Features Shape:", X_train_transformed),
    ("Testing Features Shape:", X_test_transformed),
):
    print(_caption, _matrix.shape)


Training Features Shape: (560, 28)
Testing Features Shape: (140, 28)


In [None]:
# Mount Google Drive at /content/drive in the Colab runtime.
from google.colab import drive

drive.mount(mountpoint='/content/drive')