In [3]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Build a small demonstration frame: two numeric columns ('A', 'B') that each
# contain one missing value (NaN), plus a string column ('C').
data = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[5, 4, 3, np.nan, 1],
    C=['foo', 'bar', 'baz', 'foo', 'bar'],
)
df = pd.DataFrame(data)

# Show the starting point so later cells can be compared against it.
print("Initial DataFrame:", df, sep="\n")



Initial DataFrame:
     A    B    C
0  1.0  5.0  foo
1  2.0  4.0  bar
2  NaN  3.0  baz
3  4.0  NaN  foo
4  5.0  1.0  bar


In [4]:

# 1. Data loading
# Example only: this notebook works on the sample DataFrame built above.
# To work with real data, load a CSV file instead (e.g. 'data.csv'):
# df = pd.read_csv('data.csv')



In [5]:
# 2. Data cleaning - handling missing data
# Replace each NaN with the mean of its own column. Only numeric columns
# contribute a mean (numeric_only=True), so non-numeric columns are not
# filled. The result is a new frame; `df` itself is not modified.
df_filled = df.fillna(value=df.mean(axis=0, numeric_only=True))
print("\nDataFrame after filling missing values:", df_filled, sep="\n")




DataFrame after filling missing values:
     A     B    C
0  1.0  5.00  foo
1  2.0  4.00  bar
2  3.0  3.00  baz
3  4.0  3.25  foo
4  5.0  1.00  bar


In [6]:
# 3. Dealing with missing data - drop rows with NaN values
# Keep only the rows that have no NaN in any column (the defaults, spelled
# out explicitly). The original row labels are preserved, and `df` itself
# is left unchanged.
df_dropped = df.dropna(axis=0, how='any')
print("\nDataFrame after dropping rows with NaN:", df_dropped, sep="\n")


DataFrame after dropping rows with NaN:
     A    B    C
0  1.0  5.0  foo
1  2.0  4.0  bar
4  5.0  1.0  bar


In [8]:
# 4. Removing duplicates
# NOTE: this adds column 'D' to `df` in place; later cells see this column.
df['D'] = [1, 2, 2, 1, 3]  # Adding a new column for demonstration

# drop_duplicates compares entire rows and keeps the first occurrence of
# each. As the printed result shows, no two rows of this sample are
# identical across all columns, so nothing is actually removed here.
df_no_duplicates = df.drop_duplicates(keep='first')
print("\nDataFrame after removing duplicates:", df_no_duplicates, sep="\n")




DataFrame after removing duplicates:
     A    B    C  D
0  1.0  5.0  foo  1
1  2.0  4.0  bar  2
2  NaN  3.0  baz  2
3  4.0  NaN  foo  1
4  5.0  1.0  bar  3


In [9]:
# 5. Slicing and dicing
# Pick columns 'A' and 'C', then take the rows labelled 1 through 3.
# Label-based slicing with .loc includes BOTH endpoints, so row 3 is kept.
sliced_df = df[['A', 'C']].loc[1:3]
print("\nSliced DataFrame:", sliced_df, sep="\n")




Sliced DataFrame:
     A    C
1  2.0  bar
2  NaN  baz
3  4.0  foo


In [10]:
# 6. Filtering and selecting data
# Boolean-mask selection: keep rows whose 'A' value is strictly greater
# than 2. A NaN in 'A' compares as False, so that row is excluded.
filtered_df = df.loc[df['A'].gt(2)]
print("\nFiltered DataFrame where A > 2:", filtered_df, sep="\n")




Filtered DataFrame where A > 2:
     A    B    C  D
3  4.0  NaN  foo  1
4  5.0  1.0  bar  3


In [11]:
# 7. Concatenating and transforming
# Two extra rows that carry only columns 'A', 'B' and 'C'.
new_data = pd.DataFrame(
    dict(
        A=[10, 11],
        B=[15, 16],
        C=['new1', 'new2'],
    )
)

# Stack the new rows underneath `df` and renumber the index from 0.
# Columns that exist in `df` but not in `new_data` are filled with NaN
# for the appended rows.
concatenated_df = pd.concat(objs=[df, new_data], axis=0, ignore_index=True)
print("\nConcatenated DataFrame:", concatenated_df, sep="\n")




Concatenated DataFrame:
      A     B     C    D
0   1.0   5.0   foo  1.0
1   2.0   4.0   bar  2.0
2   NaN   3.0   baz  2.0
3   4.0   NaN   foo  1.0
4   5.0   1.0   bar  3.0
5  10.0  15.0  new1  NaN
6  11.0  16.0  new2  NaN


In [12]:
# 8. Adding new cases and variables
# Derive column 'E' as twice column 'A' (NaN in 'A' stays NaN in 'E').
# NOTE: this modifies `df` in place; later cells see column 'E'.
df['E'] = df['A'].mul(2)  # Adding a new variable
print("\nDataFrame after adding a new column 'E':", df, sep="\n")




DataFrame after adding a new column 'E':
     A    B    C  D     E
0  1.0  5.0  foo  1   2.0
1  2.0  4.0  bar  2   4.0
2  NaN  3.0  baz  2   NaN
3  4.0  NaN  foo  1   8.0
4  5.0  1.0  bar  3  10.0


In [13]:
# 9. Removing data
# drop() returns a new frame without column 'B'; `df` keeps the column.
df_removed = df.drop(labels='B', axis=1)  # Removing column 'B'
print("\nDataFrame after removing column 'B':", df_removed, sep="\n")




DataFrame after removing column 'B':
     A    C  D     E
0  1.0  foo  1   2.0
1  2.0  bar  2   4.0
2  NaN  baz  2   NaN
3  4.0  foo  1   8.0
4  5.0  bar  3  10.0


In [14]:
# 10. Sorting and shuffling
# Sort rows by column 'A' in ascending order. Rows where 'A' is NaN are
# placed at the end (the sort_values default), and the original row labels
# are preserved.
df_sorted = df.sort_values(by='A')
print("\nDataFrame sorted by column 'A':")
print(df_sorted)

# Shuffle every row (frac=1 samples 100% of the rows without replacement)
# and renumber the index from 0. A fixed seed makes the shuffled order
# identical on every run, so "Restart Kernel -> Run All" reproduces the
# same output instead of a different permutation each time.
SHUFFLE_SEED = 42
df_shuffled = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)  # Shuffling rows
print("\nShuffled DataFrame:")
print(df_shuffled)




DataFrame sorted by column 'A':
     A    B    C  D     E
0  1.0  5.0  foo  1   2.0
1  2.0  4.0  bar  2   4.0
3  4.0  NaN  foo  1   8.0
4  5.0  1.0  bar  3  10.0
2  NaN  3.0  baz  2   NaN

Shuffled DataFrame:
     A    B    C  D     E
0  NaN  3.0  baz  2   NaN
1  2.0  4.0  bar  2   4.0
2  5.0  1.0  bar  3  10.0
3  1.0  5.0  foo  1   2.0
4  4.0  NaN  foo  1   8.0


In [15]:
# 11. Aggregating data
# Group rows by the label in 'C' and compute, per group, the mean of 'A'
# and the sum of 'B'. Missing values are skipped by both aggregations; a
# group whose 'A' values are all NaN has a NaN mean.
aggregated = df.groupby('C').agg(
    A=('A', 'mean'),
    B=('B', 'sum'),
)
print("\nAggregated DataFrame:", aggregated, sep="\n")




Aggregated DataFrame:
       A    B
C            
bar  3.5  5.0
baz  NaN  3.0
foo  2.5  5.0


In [16]:
# 12. Handling outliers
# Replacing outliers in column 'A' with the median
# An outlier is any value more than 1.5 * IQR below the first quartile or
# above the third quartile (the usual box-plot fences).
q_low, q_high = df['A'].quantile([0.25, 0.75])
iqr = q_high - q_low
outlier_condition = df['A'].lt(q_low - 1.5 * iqr) | df['A'].gt(q_high + 1.5 * iqr)

# Where the condition holds, substitute the column median; elsewhere keep
# the original value. NaN never satisfies either comparison, so missing
# values are left as NaN. NOTE: this overwrites df['A'] in place.
df['A'] = df['A'].mask(outlier_condition, df['A'].median())
print("\nDataFrame after handling outliers in column 'A':", df, sep="\n")




DataFrame after handling outliers in column 'A':
     A    B    C  D     E
0  1.0  5.0  foo  1   2.0
1  2.0  4.0  bar  2   4.0
2  NaN  3.0  baz  2   NaN
3  4.0  NaN  foo  1   8.0
4  5.0  1.0  bar  3  10.0


In [17]:
# 13. Data wrangling
# Pivot table example
# One row per distinct value of 'C', holding the sum of 'A' for that group.
# NOTE: as the printed result shows, a group whose 'A' values are all NaN
# is reported as 0.0 (sum skips NaN), not as missing.
pivot_df = pd.pivot_table(df, values='A', index='C', aggfunc='sum')
print("\nPivot table:", pivot_df, sep="\n")




Pivot table:
       A
C       
bar  7.0
baz  0.0
foo  5.0


In [18]:
# 14. Data normalization
# Normalizing column 'A' using min-max scaling
# (x - min) / (max - min) maps the smallest value to 0 and the largest to 1.
# min() and max() skip NaN, and a NaN input stays NaN in the result.
# NOTE: this adds column 'A_normalized' to `df` in place.
df['A_normalized'] = df['A'].pipe(
    lambda col: (col - col.min()) / (col.max() - col.min())
)
print("\nDataFrame after normalizing column 'A':", df, sep="\n")



DataFrame after normalizing column 'A':
     A    B    C  D     E  A_normalized
0  1.0  5.0  foo  1   2.0          0.00
1  2.0  4.0  bar  2   4.0          0.25
2  NaN  3.0  baz  2   NaN           NaN
3  4.0  NaN  foo  1   8.0          0.75
4  5.0  1.0  bar  3  10.0          1.00
