In [2]:
import pandas as pd

df = pd.read_csv('Exploratory Data Analysis.csv')

print(df.head())

   Employee_ID           Name  Age Department   Salary Joining_Date  \
0          101       John Doe   28         IT  50000.0   15-06-2018   
1          102     Jane Smith   32         HR  60000.0   23-09-2016   
2          103    Emily Davis   45    Finance  80000.0   04-11-2012   
3          104  Michael Brown   29         IT  55000.0   12-07-2019   
4          105   Chris Wilson   35  Marketing  62000.0   19-03-2015   

            City  
0       New York  
1    Los Angeles  
2        Chicago  
3       New York  
4  San Francisco  


In [3]:
import pandas as pd

data = {'col1': [1, 2, 3, None],
        'col2': ['A', 'B', 'C', 'D'],
        'col3': [10.1, 20.2, None, 40.4]}
df = pd.DataFrame(data)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   col1    3 non-null      float64
 1   col2    4 non-null      object 
 2   col3    3 non-null      float64
dtypes: float64(2), object(1)
memory usage: 228.0+ bytes


In [4]:
import pandas as pd
import numpy as np

data = {'col1': [1, 2, np.nan, 4],
        'col2': [5, np.nan, 7, 8],
        'col3': [9, 10, 11, np.nan],
        'col4': [12, 13, 14, 15]}
df = pd.DataFrame(data)

missing_values_count = df.isnull().sum()

print(missing_values_count)

col1    1
col2    1
col3    1
col4    0
dtype: int64


In [5]:
import pandas as pd
import numpy as np

data = {'Name': ['Alice', 'Bob', np.nan, 'David'],
        'Salary': [70000, np.nan, 90000, 60000],
        'Department': ['HR', 'IT', 'HR', 'Finance']}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)
print("-" * 30)

df['Name'].fillna('Unknown', inplace=True)

average_salary = df['Salary'].mean()
df['Salary'].fillna(average_salary, inplace=True)

print("DataFrame with missing values handled:")
print(df)

Original DataFrame:
    Name   Salary Department
0  Alice  70000.0         HR
1    Bob      NaN         IT
2    NaN  90000.0         HR
3  David  60000.0    Finance
------------------------------
DataFrame with missing values handled:
      Name        Salary Department
0    Alice  70000.000000         HR
1      Bob  73333.333333         IT
2  Unknown  90000.000000         HR
3    David  60000.000000    Finance


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Name'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(average_salary, inplace=True)


In [6]:
import pandas as pd

data = {'col1': ['A', 'B', 'A', 'C', 'B'],
        'col2': [1, 2, 1, 3, 2]}
df = pd.DataFrame(data)

duplicate_rows = df.duplicated()
print("Boolean Series indicating duplicate rows:")
print(duplicate_rows)

num_duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {num_duplicates}")

duplicate_col1_col2 = df.duplicated(subset=['col1', 'col2'])
print("\nBoolean Series indicating duplicate rows based on 'col1' and 'col2':")
print(duplicate_col1_col2)

Boolean Series indicating duplicate rows:
0    False
1    False
2     True
3    False
4     True
dtype: bool

Number of duplicate rows: 2

Boolean Series indicating duplicate rows based on 'col1' and 'col2':
0    False
1    False
2     True
3    False
4     True
dtype: bool


In [7]:
import pandas as pd
import numpy as np

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, -3, 35, -2, 42]
}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)
print("\n" + "-"*30 + "\n")

valid_ages = df[df['Age'] >= 0]['Age']
average_age = valid_ages.mean()

print(f"Average age (ignoring negative values): {average_age:.2f}\n")

df['Age'] = df['Age'].apply(lambda x: average_age if x < 0 else x)

print("Corrected DataFrame:")
print(df)

Original DataFrame:
      Name  Age
0    Alice   25
1      Bob   -3
2  Charlie   35
3    David   -2
4      Eve   42

------------------------------

Average age (ignoring negative values): 34.00

Corrected DataFrame:
      Name   Age
0    Alice  25.0
1      Bob  34.0
2  Charlie  35.0
3    David  34.0
4      Eve  42.0


In [8]:
import pandas as pd

data = {'Age': [25, 30, 35, 40, 45, 50, 55, 60],
        'Salary': [50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000],
        'Experience': [2, 5, 8, 10, 12, 15, 18, 20]}
df = pd.DataFrame(data)

summary_statistics = df.describe()

print(summary_statistics)

             Age         Salary  Experience
count   8.000000       8.000000    8.000000
mean   42.500000   85000.000000   11.250000
std    12.247449   24494.897428    6.250714
min    25.000000   50000.000000    2.000000
25%    33.750000   67500.000000    7.250000
50%    42.500000   85000.000000   11.000000
75%    51.250000  102500.000000   15.750000
max    60.000000  120000.000000   20.000000


In [10]:
import pandas as pd

data = {
    'EmployeeID': [1, 2, 3, 4, 5, 6],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank'],
    'Department': ['HR', 'IT', 'HR', 'Sales', 'IT', 'Sales'],
    'Salary': [60000, 75000, 62000, 80000, 78000, 85000]
}
df = pd.DataFrame(data)
department_summary = df.groupby('Department').agg(
    EmployeeCount=('EmployeeID', 'count'),
    AverageSalary=('Salary', 'mean')
)
print(department_summary)

            EmployeeCount  AverageSalary
Department                              
HR                      2        61000.0
IT                      2        76500.0
Sales                   2        82500.0


In [15]:
import pandas as pd


df.to_csv('Exploratory Data Analysis.csv', index=False)