In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Sample DataFrame with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Edward'],
    'Age': [25, np.nan, 35, 45, np.nan],
    'Salary': [50000, 60000, np.nan, 80000, 120000],
    'Gender': ['Female', 'Male', np.nan, 'Male', 'Female']
}

df = pd.DataFrame(data)

In [6]:
# Display the DataFrame with missing values
print("Original DataFrame:")
print(df)




Original DataFrame:
      Name   Age    Salary  Gender
0    Alice  25.0   50000.0  Female
1      Bob   NaN   60000.0    Male
2  Charlie  35.0       NaN     NaN
3    David  45.0   80000.0    Male
4   Edward   NaN  120000.0  Female


In [7]:


# 1. Identifying Missing Values
print("\nChecking for Missing Values:")
print(df.isnull().sum())


Checking for Missing Values:
Name      0
Age       2
Salary    1
Gender    1
dtype: int64


In [8]:

# 2. Handling Missing Values

# 2.1 Dropping rows with missing values
df_dropna = df.dropna()
print("\nDataFrame after dropping rows with missing values:")
print(df_dropna)


DataFrame after dropping rows with missing values:
    Name   Age   Salary  Gender
0  Alice  25.0  50000.0  Female
3  David  45.0  80000.0    Male


In [9]:

# 2.2 Filling missing values with a specific value (e.g., 0)
df_fill_0 = df.fillna(0)
print("\nDataFrame after filling missing values with 0:")
print(df_fill_0)


DataFrame after filling missing values with 0:
      Name   Age    Salary  Gender
0    Alice  25.0   50000.0  Female
1      Bob   0.0   60000.0    Male
2  Charlie  35.0       0.0       0
3    David  45.0   80000.0    Male
4   Edward   0.0  120000.0  Female


In [10]:



# 2.3 Filling missing values with the mean of the column
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(df['Salary'].mean())
print("\nDataFrame after filling missing values with mean:")
print(df)


DataFrame after filling missing values with mean:
      Name   Age    Salary  Gender
0    Alice  25.0   50000.0  Female
1      Bob  35.0   60000.0    Male
2  Charlie  35.0   77500.0     NaN
3    David  45.0   80000.0    Male
4   Edward  35.0  120000.0  Female


In [11]:

# 2.4 Filling missing values with the mode (for categorical data)
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
print("\nDataFrame after filling missing values with mode for categorical column:")
print(df)        


DataFrame after filling missing values with mode for categorical column:
      Name   Age    Salary  Gender
0    Alice  25.0   50000.0  Female
1      Bob  35.0   60000.0    Male
2  Charlie  35.0   77500.0  Female
3    David  45.0   80000.0    Male
4   Edward  35.0  120000.0  Female


In [12]:


# 3. Summary of missing values handling
print("\nFinal DataFrame after handling all missing values:")
print(df)


Final DataFrame after handling all missing values:
      Name   Age    Salary  Gender
0    Alice  25.0   50000.0  Female
1      Bob  35.0   60000.0    Male
2  Charlie  35.0   77500.0  Female
3    David  45.0   80000.0    Male
4   Edward  35.0  120000.0  Female


In [6]:
""" In Jupyter, you can use LabelEncoder and OneHotEncoder from sklearn.preprocessing to transform 
  categorical data into numerical format. Below is the Python code that demonstrates how to apply 
  Label Encoding and One-Hot Encoding using a sample dataset.

Explanation:
Label Encoding: Converts each category into a numeric value. For instance, 'Male' could be encoded
as 0, and 'Female' as 1.
One-Hot Encoding: Creates binary columns for each category. For example, 'Male' and 'Female' 
would become two separate columns with binary values (0 or 1)."""

# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Sample DataFrame with categorical data
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Edward'],
    'Gender': ['Female', 'Male', 'Male', 'Male', 'Female'],
    'Country': ['USA', 'UK', 'Germany', 'France', 'USA']
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
# 1. Label Encoding
# Label Encoding for Gender column
label_encoder = LabelEncoder()
df['Gender_LabelEncoded'] = label_encoder.fit_transform(df['Gender'])
print("\nDataFrame after Label Encoding for 'Gender':")
print(df)

# 2. One-Hot Encoding
# One-Hot Encoding for Country column
one_hot_encoder = OneHotEncoder(sparse_output=False)  # Set sparse=False to return a dense array
#even though most of the values are 0, they are still stored in memory and shown when you print the array.
#sparse array only store the non-zero elements and their positions.
one_hot_encoded = one_hot_encoder.fit_transform(df[['Country']]) 
#sparse array is an array where most of the elements are zero

# Convert the array back to a DataFrame for better visualization
one_hot_df = pd.DataFrame(one_hot_encoded, columns=
one_hot_encoder.get_feature_names_out(['Country']))
#convert array into dataframe ,taken values from  names from country
df = pd.concat([df, one_hot_df], axis=1)

#axis=1: This means concatenate along the columns (side by side)
#. If axis=0 were used, it would concatenate along the rows (stacking vertically).

print("\nDataFrame after One-Hot Encoding for 'Country':")
print(df)




Original DataFrame:
      Name  Gender  Country
0    Alice  Female      USA
1      Bob    Male       UK
2  Charlie    Male  Germany
3    David    Male   France
4   Edward  Female      USA

DataFrame after Label Encoding for 'Gender':
      Name  Gender  Country  Gender_LabelEncoded
0    Alice  Female      USA                    0
1      Bob    Male       UK                    1
2  Charlie    Male  Germany                    1
3    David    Male   France                    1
4   Edward  Female      USA                    0

DataFrame after One-Hot Encoding for 'Country':
      Name  Gender  Country  Gender_LabelEncoded  Country_France  \
0    Alice  Female      USA                    0             0.0   
1      Bob    Male       UK                    1             0.0   
2  Charlie    Male  Germany                    1             0.0   
3    David    Male   France                    1             1.0   
4   Edward  Female      USA                    0             0.0   

   Country_Ger

In [11]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import load_iris

# Load dataset (Iris dataset for this example)
data = load_iris()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)
#random_state= 42 means seed value for random number generator , 
#Every time you run the code, you'll get the same train-test split if you use the same random_state.
# FEATURE SCALING
# Standard Scaler (Z-score normalization)
standard_scaler = StandardScaler()
X_train_standard_scaled = standard_scaler.fit_transform(X_train)
X_test_standard_scaled = standard_scaler.transform(X_test)
# Showing results
print("Original feature set shape:", X_train.shape)
print("Selected feature set shape after feature selection:", X_train_selected.shape)




Original feature set shape: (105, 4)
Selected feature set shape after feature selection: (105, 2)


In [8]:
# Min-Max Scaler (Scaling data between 0 and 1)
minmax_scaler = MinMaxScaler()
X_train_minmax_scaled = minmax_scaler.fit_transform(X_train)
X_test_minmax_scaled = minmax_scaler.transform(X_test)

# FEATURE SELECTION

# Using SelectKBest for feature selection (keeping top 2 features)
selector = SelectKBest(score_func=f_classif, k=2)
X_train_selected = selector.fit_transform(X_train_standard_scaled, y_train)
X_test_selected = selector.transform(X_test_standard_scaled)

# Showing results
print("Original feature set shape:", X_train.shape)
print("Selected feature set shape after feature selection:", X_train_selected.shape)

# You can also check which features were selected:
selected_features = selector.get_support(indices=True)
print("Selected feature indices:", selected_features)

Original feature set shape: (105, 4)
Selected feature set shape after feature selection: (105, 2)
Selected feature indices: [2 3]


In [1]:
pwd

'C:\\Users\\shalini singh'