In [1]:
import pandas as pd

# Example DataFrame with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': [24, 27, None, 32, 28],
    'City': ['New York', None, 'Chicago', 'Houston', 'Phoenix'],
    'Salary': [50000, 60000, 45000, None, 65000]
}

df = pd.DataFrame(data)
print(df)

# Checking for missing values
print(df.isnull().sum())

      Name   Age      City   Salary
0    Alice  24.0  New York  50000.0
1      Bob  27.0      None  60000.0
2  Charlie   NaN   Chicago  45000.0
3    David  32.0   Houston      NaN
4      Eva  28.0   Phoenix  65000.0
Name      0
Age       1
City      1
Salary    1
dtype: int64


In [2]:
# Dropping rows with any missing values
df_dropped_rows = df.dropna()
print(df_dropped_rows)

# Dropping columns with any missing values
df_dropped_columns = df.dropna(axis=1)
print(df_dropped_columns)

    Name   Age      City   Salary
0  Alice  24.0  New York  50000.0
4    Eva  28.0   Phoenix  65000.0
      Name
0    Alice
1      Bob
2  Charlie
3    David
4      Eva


In [3]:
# Filling missing values with a specified value
df_filled_value = df.fillna('Unknown')
print(df_filled_value)

# Filling missing values with the mean of the column
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].mean(), inplace=True)
df['City'].fillna(df['City'].mode()[0], inplace=True)
print(df)

      Name      Age      City   Salary
0    Alice     24.0  New York  50000.0
1      Bob     27.0   Unknown  60000.0
2  Charlie  Unknown   Chicago  45000.0
3    David     32.0   Houston  Unknown
4      Eva     28.0   Phoenix  65000.0
      Name    Age      City   Salary
0    Alice  24.00  New York  50000.0
1      Bob  27.00   Chicago  60000.0
2  Charlie  27.75   Chicago  45000.0
3    David  32.00   Houston  55000.0
4      Eva  28.00   Phoenix  65000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

In [4]:
# Example DataFrame with duplicates
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Alice'],
    'Age': [24, 27, 22, 32, 28, 24],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'New York']
}

df = pd.DataFrame(data)
print(df)

# Checking for duplicate rows
print(df.duplicated())

      Name  Age         City
0    Alice   24     New York
1      Bob   27  Los Angeles
2  Charlie   22      Chicago
3    David   32      Houston
4      Eva   28      Phoenix
5    Alice   24     New York
0    False
1    False
2    False
3    False
4    False
5     True
dtype: bool


In [5]:
# Removing duplicate rows
df_no_duplicates = df.drop_duplicates()
print(df_no_duplicates)

      Name  Age         City
0    Alice   24     New York
1      Bob   27  Los Angeles
2  Charlie   22      Chicago
3    David   32      Houston
4      Eva   28      Phoenix


In [6]:
# Example DataFrame with mixed data types
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Age': ['24', '27', '22', '32', '28'],  # Age as string
    'Salary': [50000, 60000, 45000, 70000, 65000]
}

df = pd.DataFrame(data)
print(df.dtypes)

# Converting Age from string to integer
df['Age'] = df['Age'].astype(int)
print(df.dtypes)

# Converting Salary from integer to float
df['Salary'] = df['Salary'].astype(float)
print(df.dtypes)


Name      object
Age       object
Salary     int64
dtype: object
Name      object
Age        int64
Salary     int64
dtype: object
Name       object
Age         int64
Salary    float64
dtype: object


In [7]:
import pandas as pd

# Creating a messy DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Alice'],
    'Age': ['24', '27', None, '32', '28', '24'],
    'City': ['New York', None, 'Chicago', 'Houston', 'Phoenix', 'New York'],
    'Salary': [50000, 60000, 45000, None, 65000, 50000]
}

df = pd.DataFrame(data)
print("Initial DataFrame:")
print(df)


Initial DataFrame:
      Name   Age      City   Salary
0    Alice    24  New York  50000.0
1      Bob    27      None  60000.0
2  Charlie  None   Chicago  45000.0
3    David    32   Houston      NaN
4      Eva    28   Phoenix  65000.0
5    Alice    24  New York  50000.0


In [10]:
# Convert 'Age' column to numeric, forcing errors to NaN
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

# Fill missing values in 'Age' with the mean of the column
df['Age'].fillna(df['Age'].mean(), inplace=True)

# Fill missing values in 'Salary' with the mean of the column
df['Salary'].fillna(df['Salary'].mean(), inplace=True)

# Fill missing values in 'City' with 'Unknown'
df['City'].fillna('Unknown', inplace=True)

# Display the DataFrame
print(df)

      Name   Age      City   Salary
0    Alice  24.0  New York  50000.0
1      Bob  27.0   Unknown  60000.0
2  Charlie  27.0   Chicago  45000.0
3    David  32.0   Houston  54000.0
4      Eva  28.0   Phoenix  65000.0
5    Alice  24.0  New York  50000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

In [11]:
# Removing duplicate rows
df.drop_duplicates(inplace=True)

print("\nDataFrame after removing duplicates:")
print(df)


DataFrame after removing duplicates:
      Name   Age      City   Salary
0    Alice  24.0  New York  50000.0
1      Bob  27.0   Unknown  60000.0
2  Charlie  27.0   Chicago  45000.0
3    David  32.0   Houston  54000.0
4      Eva  28.0   Phoenix  65000.0


In [12]:
# Converting data types
df['Age'] = df['Age'].astype(float)
df['Salary'] = df['Salary'].astype(float)
print("\nDataFrame after converting data types:")
print(df)
print(df.dtypes)


DataFrame after converting data types:
      Name   Age      City   Salary
0    Alice  24.0  New York  50000.0
1      Bob  27.0   Unknown  60000.0
2  Charlie  27.0   Chicago  45000.0
3    David  32.0   Houston  54000.0
4      Eva  28.0   Phoenix  65000.0
Name       object
Age       float64
City       object
Salary    float64
dtype: object


In [13]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Example DataFrame
data = {
    'Feature1': [10, 20, 30, 40, 50],
    'Feature2': [100, 200, 300, 400, 500]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Applying Min-Max Scaling
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
print("\nScaled DataFrame:")
print(df_scaled)


Original DataFrame:
   Feature1  Feature2
0        10       100
1        20       200
2        30       300
3        40       400
4        50       500

Scaled DataFrame:
   Feature1  Feature2
0      0.00      0.00
1      0.25      0.25
2      0.50      0.50
3      0.75      0.75
4      1.00      1.00


In [14]:
from sklearn.preprocessing import StandardScaler

# Applying Z-score Normalization
scaler = StandardScaler()
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
print("\nNormalized DataFrame:")
print(df_normalized)



Normalized DataFrame:
   Feature1  Feature2
0 -1.414214 -1.414214
1 -0.707107 -0.707107
2  0.000000  0.000000
3  0.707107  0.707107
4  1.414214  1.414214


In [16]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Example DataFrame with categorical data
data = {
    'Color': ['Red', 'Green', 'Blue', 'Green', 'Red'],
    'Size': ['S', 'M', 'L', 'M', 'S']
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Applying One-Hot Encoding
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(df[['Color', 'Size']])
df_encoded = pd.DataFrame(encoded_data, 
                          columns=encoder.get_feature_names_out(['Color', 'Size']))
print("\nOne-Hot Encoded DataFrame:")
print(df_encoded)


Original DataFrame:
   Color Size
0    Red    S
1  Green    M
2   Blue    L
3  Green    M
4    Red    S

One-Hot Encoded DataFrame:
   Color_Blue  Color_Green  Color_Red  Size_L  Size_M  Size_S
0         0.0          0.0        1.0     0.0     0.0     1.0
1         0.0          1.0        0.0     0.0     1.0     0.0
2         1.0          0.0        0.0     1.0     0.0     0.0
3         0.0          1.0        0.0     0.0     1.0     0.0
4         0.0          0.0        1.0     0.0     0.0     1.0


In [17]:
from sklearn.preprocessing import LabelEncoder

# Applying Label Encoding
label_encoder = LabelEncoder()
df['Color'] = label_encoder.fit_transform(df['Color'])
df['Size'] = label_encoder.fit_transform(df['Size'])
print("\nLabel Encoded DataFrame:")
print(df)



Label Encoded DataFrame:
   Color  Size
0      2     2
1      1     1
2      0     0
3      1     1
4      2     2


In [18]:
import pandas as pd

# Creating a sample DataFrame
data = {
    'Age': [25, 45, 35, 50, 23],
    'Income': [50000, 100000, 75000, 120000, 40000],
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Male'],
    'Purchased': ['No', 'Yes', 'Yes', 'No', 'Yes']
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)


Original DataFrame:
   Age  Income  Gender Purchased
0   25   50000    Male        No
1   45  100000  Female       Yes
2   35   75000  Female       Yes
3   50  120000    Male        No
4   23   40000    Male       Yes


In [19]:
# Encoding categorical variables
df['Gender'] = label_encoder.fit_transform(df['Gender'])
df['Purchased'] = label_encoder.fit_transform(df['Purchased'])
print("\nDataFrame after Label Encoding:")
print(df)



DataFrame after Label Encoding:
   Age  Income  Gender  Purchased
0   25   50000       1          0
1   45  100000       0          1
2   35   75000       0          1
3   50  120000       1          0
4   23   40000       1          1


In [20]:
# Applying Min-Max Scaling
scaler = MinMaxScaler()
df[['Age', 'Income']] = scaler.fit_transform(df[['Age', 'Income']])
print("\nDataFrame after Scaling:")
print(df)



DataFrame after Scaling:
        Age  Income  Gender  Purchased
0  0.074074  0.1250       1          0
1  0.814815  0.7500       0          1
2  0.444444  0.4375       0          1
3  1.000000  1.0000       1          0
4  0.000000  0.0000       1          1


In [21]:
print("\nFinal Prepared DataFrame:")
print(df)


Final Prepared DataFrame:
        Age  Income  Gender  Purchased
0  0.074074  0.1250       1          0
1  0.814815  0.7500       0          1
2  0.444444  0.4375       0          1
3  1.000000  1.0000       1          0
4  0.000000  0.0000       1          1
