# Tasks for Pandas

## 1. Install Pandas and check its version

In [1]:
pip install pandas


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd

# Check the version of Pandas

In [7]:
print(pd.__version__)

2.2.3


## 2. Create a dataframe manually. Check the dataframe using .head() and .tail().

In [8]:
data = {
    'Name': ['EFRON', 'SNEHASIS', 'OM', 'RAJESH', 'RAGHU'],
    'Age': [24, 27, 22, 29,32],
    'City': ['BANGALORE', 'SAKHIGOPAL', 'KHORDHA', 'BERHAMPUR', 'JATANI']
}

df = pd.DataFrame(data)

In [9]:
# Display the first 5 rows
print(df.head())

# Display the last 5 rows
print(df.tail())

       Name  Age        City
0     EFRON   24   BANGALORE
1  SNEHASIS   27  SAKHIGOPAL
2        OM   22     KHORDHA
3    RAJESH   29   BERHAMPUR
4     RAGHU   32      JATANI
       Name  Age        City
0     EFRON   24   BANGALORE
1  SNEHASIS   27  SAKHIGOPAL
2        OM   22     KHORDHA
3    RAJESH   29   BERHAMPUR
4     RAGHU   32      JATANI


## 3. Show the attributes of panda dataframe.

In [10]:
# Show the attributes of the DataFrame
print("Columns:", df.columns)
print("Index:", df.index)
print("Data Types:", df.dtypes)
print("Shape:", df.shape)

Columns: Index(['Name', 'Age', 'City'], dtype='object')
Index: RangeIndex(start=0, stop=5, step=1)
Data Types: Name    object
Age      int64
City    object
dtype: object
Shape: (5, 3)


## 4.Renaming the indexes of dataframe.

In [11]:
# Renaming the indexes
df.rename(index={0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E'}, inplace=True)
print(df)

       Name  Age        City
A     EFRON   24   BANGALORE
B  SNEHASIS   27  SAKHIGOPAL
C        OM   22     KHORDHA
D    RAJESH   29   BERHAMPUR
E     RAGHU   32      JATANI


## 5.Renaming headers of dataframe.

In [12]:
# Renaming the headers
df.rename(columns={'Name': 'Full Name', 'Age': 'Years', 'City': 'Location'}, inplace=True)
print(df)

  Full Name  Years    Location
A     EFRON     24   BANGALORE
B  SNEHASIS     27  SAKHIGOPAL
C        OM     22     KHORDHA
D    RAJESH     29   BERHAMPUR
E     RAGHU     32      JATANI


## 6. Load data from CSV file / Excel file / web site for creating a dataframe.

In [13]:
# Load data from the specified CSV file
file_path = r'C:\Users\davul\OneDrive\Desktop\2ndsem\datavis\people_data.csv'
df = pd.read_csv(file_path)

# Display the DataFrame
print(df)

       Name  Years        City
0     EFRON     24   BANGALORE
1  SNEHASIS     27  SAKHIGOPAL
2        OM     22     KHORDHA
3    RAJESH     29   BERHAMPUR
4     RAGHU     32      JATANI


## 7. Handling of missing values in Pandas dataframe

In [14]:
# Introduce some missing values
df.loc[1, 'Age'] = None

# Handling missing values
df.fillna(df.mean(numeric_only=True), inplace=True)  # Fill with mean for numeric columns
print(df)

       Name  Years        City   Age
0     EFRON     24   BANGALORE   NaN
1  SNEHASIS     27  SAKHIGOPAL  None
2        OM     22     KHORDHA   NaN
3    RAJESH     29   BERHAMPUR   NaN
4     RAGHU     32      JATANI   NaN


## 8. Sorting, Slicing and Extracting Data in pandas

In [15]:
# Sorting the DataFrame by Age
sorted_df = df.sort_values(by='Years')
print(sorted_df)

# Slicing the DataFrame
sliced_df = df.iloc[0:2]  # First two rows
print(sliced_df)

       Name  Years        City   Age
2        OM     22     KHORDHA   NaN
0     EFRON     24   BANGALORE   NaN
1  SNEHASIS     27  SAKHIGOPAL  None
3    RAJESH     29   BERHAMPUR   NaN
4     RAGHU     32      JATANI   NaN
       Name  Years        City   Age
0     EFRON     24   BANGALORE   NaN
1  SNEHASIS     27  SAKHIGOPAL  None


## 9. Filtering dataframe using conditions.

In [16]:
# Filtering the DataFrame for Age greater than 25
filtered_df = df[df['Years'] > 25]
print(filtered_df)

       Name  Years        City   Age
1  SNEHASIS     27  SAKHIGOPAL  None
3    RAJESH     29   BERHAMPUR   NaN
4     RAGHU     32      JATANI   NaN


## 10. Isolating columns and rows.

In [17]:
# Load data from the specified CSV file
file_path = r'C:\Users\davul\OneDrive\Desktop\2ndsem\datavis\people_data.csv'
df = pd.read_csv(file_path)

# Renaming the 'Age' column to 'Years'
df.rename(columns={'Age': 'Years'}, inplace=True)

# Isolating the 'Years' column
years_column = df['Years']
print("Years Column:")
print(years_column)

# Isolating a row using integer index
row_1 = df.loc[1]  # Accessing the second row (index 1)
print("\nRow at index 1:")
print(row_1)

Years Column:
0    24
1    27
2    22
3    29
4    32
Name: Years, dtype: int64

Row at index 1:
Name       SNEHASIS
Years            27
City     SAKHIGOPAL
Name: 1, dtype: object


## 11. Data cleaning in Pandas (Empty cells, Data in wrong format, Wrong data, Duplicates)


In [18]:
# Removing duplicates
df = df.drop_duplicates()

# Checking for empty cells
print(df.isnull().sum())

# Convert data types if necessary
df['Years'] = df['Years'].astype(int)  # Ensure 'Years' is an integer

Name     0
Years    0
City     0
dtype: int64


## 12. Data analysis in Pandas.

In [19]:
# Basic statistics
print(df.describe())

           Years
count   5.000000
mean   26.800000
std     3.962323
min    22.000000
25%    24.000000
50%    27.000000
75%    29.000000
max    32.000000


## 13. Pandas - Data Correlations


In [20]:
# Compute the correlation matrix using only numeric columns
correlation_matrix = df[['Years']].corr()  # Only 'Years' is numeric
print("Correlation Matrix:")
print(correlation_matrix)

Correlation Matrix:
       Years
Years    1.0


## 14. Groupby functions in Pandas.

In [21]:
# Grouping by City and calculating the mean age
grouped_df = df.groupby('City')['Years'].mean()
print(grouped_df)

City
BANGALORE     24.0
BERHAMPUR     29.0
JATANI        32.0
KHORDHA       22.0
SAKHIGOPAL    27.0
Name: Years, dtype: float64


## 15. Different types of join in Pandas.

In [22]:
import pandas as pd

# Creating the first DataFrame
data1 = {
    'Name': ['EFRON', 'SNEHASIS', 'OM', 'RAJESH', 'RAGHU'],
    'Years': [24, 27, 22, 29, 32],
    'City': ['BANGALORE', 'SAKHIGOPAL', 'KHORDHA', 'BERHAMPUR', 'JATANI']
}
df1 = pd.DataFrame(data1)

# Creating the second DataFrame
data2 = {
    'Name': ['EFRON', 'OM', 'RAJESH', 'ANIL'],
    'Salary': [50000, 60000, 70000, 80000]
}
df2 = pd.DataFrame(data2)

# Inner Join
inner_join = pd.merge(df1, df2, on='Name', how='inner')
print("Inner Join:")
print(inner_join)

# Outer Join
outer_join = pd.merge(df1, df2, on='Name', how='outer')
print("\nOuter Join:")
print(outer_join)

# Left Join
left_join = pd.merge(df1, df2, on='Name', how='left')
print("\nLeft Join:")
print(left_join)

# Right Join
right_join = pd.merge(df1, df2, on='Name', how='right')
print("\nRight Join:")
print(right_join)

Inner Join:
     Name  Years       City  Salary
0   EFRON     24  BANGALORE   50000
1      OM     22    KHORDHA   60000
2  RAJESH     29  BERHAMPUR   70000

Outer Join:
       Name  Years        City   Salary
0      ANIL    NaN         NaN  80000.0
1     EFRON   24.0   BANGALORE  50000.0
2        OM   22.0     KHORDHA  60000.0
3     RAGHU   32.0      JATANI      NaN
4    RAJESH   29.0   BERHAMPUR  70000.0
5  SNEHASIS   27.0  SAKHIGOPAL      NaN

Left Join:
       Name  Years        City   Salary
0     EFRON     24   BANGALORE  50000.0
1  SNEHASIS     27  SAKHIGOPAL      NaN
2        OM     22     KHORDHA  60000.0
3    RAJESH     29   BERHAMPUR  70000.0
4     RAGHU     32      JATANI      NaN

Right Join:
     Name  Years       City  Salary
0   EFRON   24.0  BANGALORE   50000
1      OM   22.0    KHORDHA   60000
2  RAJESH   29.0  BERHAMPUR   70000
3    ANIL    NaN        NaN   80000
