# Data Science Libraries Assignment 2

In [None]:
# Installing libraries

! pip install pandas
! pip install matplotlib
! pip install seaborn

In [None]:

# Importing libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Section 1

## Task 1: Setup and DataFrame Creation

In [None]:
# Create a DataFrame from a dictionary: each key becomes a column and each
# list holds that column's values.
# The dictionary is named `people` rather than `dict`, because assigning to
# `dict` would hide the built-in type for every later cell in the kernel.

people = {
    'Name': ['Jordan', 'Cody', 'Anthony', 'David'],
    'Country': ['England', 'Netherlands', 'France', 'Spain'],
    'Age': [29, 23, 29, 27],
}
df1 = pd.DataFrame(people)
df1

In [None]:
# Create a DataFrame from a list of dictionaries: each dictionary is one row.
# The list is named `records` rather than `list`, because assigning to `list`
# would hide the built-in type for every later cell in the kernel.

records = [
    {'ID': 100, 'Department': 'Sales'},
    {'ID': 200, 'Department': 'HR'},
]
df2 = pd.DataFrame(records)
df2

In [None]:
# Create a DataFrame from a CSV file.
# The path is relative to the notebook's working directory instead of an
# absolute path inside one user's Desktop, so the notebook also runs on other
# machines. messed_dataset.csv is expected next to the notebook; adjust the
# path if the file lives elsewhere.

df = pd.read_csv('messed_dataset.csv')
df

## Task 2: Viewing and Inspecting Data

In [None]:
# Preview the top of the DataFrame (5 rows, which is also head()'s default)

print('The first 5 rows of the DataFrame :')
df.head(5)


In [None]:
# Preview the bottom of the DataFrame (5 rows, which is also tail()'s default)

print('The last 5 rows of the DataFrame :')
df.tail(5)

In [None]:
# Get a summary of the DataFrame, including basic statistics and data types.
# info() prints the column dtypes and non-null counts; describe() supplies the
# basic statistics (include='all' also covers the non-numeric columns).

print('Summary of the DataFrame :')
df.info()
df.describe(include='all')

In [None]:
# Report the dimensions of the DataFrame as (rows, columns)

print('Shape of the DataFrame :')
n_rows, n_cols = df.shape
(n_rows, n_cols)

In [None]:
# Display the column names of the DataFrame as a plain Python list

print('Columns of the DataFrame :')
df.columns.to_list()

## Task 3: Selection and Indexing


In [None]:
# Pick out one column by its label (all rows, 'Name' column)
print('Selecting the name column only :')
df.loc[:, 'Name']

In [None]:
# Pick out several columns at once by passing a list of labels
print('Selecting the name and age column :')
df.loc[:, ['Name', 'Age']]

In [None]:
# Fetch one row by its integer position (position 0 is the first row)

print('Selecting first row by index')
first_row = df.iloc[0]
first_row

In [None]:
# Fetch several rows by integer position with a slice (end position excluded)

print('Selecting first three rows by index')
df.iloc[:3]

In [None]:
# Select specific rows and columns using loc and iloc

print('Selecting first three rows with columns name Country and Salary')
wanted_columns = ['Country', 'Salary']

# loc is label based: the slice 0:2 includes its end label, so labels 0, 1 and 2 are returned
by_label = df.loc[0:2, wanted_columns]

# iloc is position based: the slice 0:3 excludes its end position;
# get_loc translates each column name into its integer position
column_positions = [df.columns.get_loc(column) for column in wanted_columns]
by_position = df.iloc[0:3, column_positions]

display(by_label, by_position)      # display() is provided by the IPython kernel

## Task 4: Handling Missing Data


In [None]:
# Count the missing values per column (isna() flags each missing cell, sum() totals them)

missing_per_column = df.isna().sum()
missing_per_column

In [None]:
# Build a new DataFrame without any row that contains a missing value;
# the original df is left untouched

df_dropped = df.dropna(axis='index', how='any')
df_dropped

In [None]:
# Fill missing values with a specified value, chosen per column.
# Age and Salary get numeric fill values so those columns are not mixed with
# strings (assumes both columns hold numbers in the CSV - TODO confirm).

fill_dict = {                           # column name -> value inserted where that column is missing
    'Name': 'John',
    'Age': 50,
    'Email': 'john@missing.com',
    'Country': 'Nepal',
    'Salary': 50000,
    'DateOfJoining': '2024-07-09',
}
# Reassign instead of inplace=True: same resulting df, but the step reads as a
# plain transformation and can be chained
df = df.fillna(value=fill_dict)
df

## Task 5: Data Operations

In [None]:
# Append a constant column: every row gets the value 'Earth' under 'Planet'

df = df.assign(Planet='Earth')
df

In [None]:
# Remove the 'Planet' column again

df = df.drop(columns='Planet')
df

In [None]:
# Rename columns in the DataFrame.
# rename() returns a new DataFrame, so the result is stored under its own name;
# without an assignment the renamed frame is lost once the cell has displayed it.
# NOTE(review): the new label 'Date Of Birth' describes something different from
# a joining date - confirm this is the intended column name.

df_renamed = df.rename(columns={'DateOfJoining': 'Date Of Birth'})
df_renamed

In [None]:
# Apply a function to every value of a column

def add_title(name):
    """Return the given name prefixed with 'Mr. '."""
    return f'Mr. {name}'

# Cast to str first so every value can be formatted the same way, then apply the helper
df['Name'] = df['Name'].astype(str).apply(add_title)
df

## Task 6: GroupBy Operations


In [None]:
# Build a small salary table in which each person appears twice, then group its rows by person

names = ['John', 'Donny', 'Marcus', 'Bruno'] * 2
salaries = [40000, 30000, 70000, 20000, 70000, 50000, 10000, 110000]

new_dict = {'Name': names, 'Salary': salaries}
new_df = pd.DataFrame(new_dict)
grouped = new_df.groupby(by='Name')         # one group per distinct value of 'Name'

In [None]:
# Iterate over groups and display the group names and data.
# Iterating a GroupBy object yields (group key, sub-DataFrame) pairs.

for name, group in grouped:
    print(f'Group: {name}')
    print(group)

mean = grouped['Salary'].mean()         # mean salary of each person
mean

## Task 7: Merging and Joining DataFrames


In [None]:
# Combine two DataFrames on the column they share.

df1 = pd.DataFrame({'ID': [1, 2, 3, 4], 'Name': ['A', 'B', 'C', 'D']})
df2 = pd.DataFrame({'ID': [3, 4, 5, 6], 'Age': [22, 33, 44, 55]})

# merge() defaults to an inner join, so only IDs present in both frames remain
merged_df = df1.merge(df2, on='ID')
merged_df

In [None]:
# Combine two DataFrames by matching their index labels.

name_labels = [1, 2, 3, 4]
height_labels = [3, 4, 5, 6]

df1 = pd.DataFrame(data={'Name': ['A', 'B', 'C', 'D']}, index=name_labels)
df2 = pd.DataFrame(data={'Height': [150, 155, 160, 170]}, index=height_labels)

# An inner join keeps only the index labels found in both frames
joined_df = df1.join(df2, how='inner')
joined_df


## Task 8: Working with Dates and Times


In [None]:
# Create a datetime index for the DataFrame.

df1['Date'] = ['2000-01-02', '2001-03-04', '2003-05-06', '2003-07-08']
df1['Date'] = pd.to_datetime(df1['Date'])                   # parse the strings into datetime values

# Use the parsed dates as the row index as well. The 'Date' column is kept so
# later cells can still read it; the index gets its own name to avoid clashing
# with the column label.
df1.index = pd.DatetimeIndex(df1['Date'], name='DateIndex')


In [None]:
# Split the datetime column into separate year / month / day columns via the .dt accessor

date_parts = df1['Date'].dt
df1 = df1.assign(
    Year=date_parts.year,
    Month=date_parts.month,
    Day=date_parts.day,
)
df1

## Task 9: Input and Output


In [None]:
# Read data from a CSV file into a DataFrame.
# The path is relative to the notebook's working directory instead of an
# absolute path inside one user's Desktop, so the notebook also runs on other
# machines. messed_dataset.csv is expected next to the notebook; adjust the
# path if the file lives elsewhere.
df = pd.read_csv('messed_dataset.csv')

print("DataFrame read from CSV:")
df


In [None]:
# Save df1 as a CSV file in the working directory; index=False leaves the row index out of the file

csv_target = 'DataFrameToCSV.csv'
df1.to_csv(csv_target, index=False)

In [None]:
#Installing openpyxl library for reading and writing Excel(.xlsx) files

!pip install openpyxl

In [None]:
# Load an Excel workbook into a DataFrame. The data given is `SaleData.xlsx`.

excel_source = 'SaleData.xlsx'
df_excel = pd.read_excel(excel_source, sheet_name=0)        # sheet_name=0 is the default: the first sheet
df_excel

In [None]:
# Save df1 as an Excel workbook in the working directory; index=False leaves the row index out of the sheet

excel_target = 'DataFrameToExcel.xlsx'
df1.to_excel(excel_target, index=False)

# Section 2: Reference to Friday Class on EDA


## Task 10: Visualization


In [None]:
# Draw a simple line plot with Matplotlib, using an explicit Axes object.

# Sample data
x = [1, 2, 3, 4, 5]
y = [2, 3, 5, 7, 11]

fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.set_title('Simple Plot using Matplotlib')
plt.show()

In [None]:
# Create a bar plot using Seaborn.

# Sample Data
categories = ['A', 'B', 'C']
values = [10, 20, 15]

# barplot() returns the Matplotlib Axes it drew on; label it so the figure
# can be read on its own, like the Matplotlib plot in the previous cell
ax = sns.barplot(x=categories, y=values)
ax.set_xlabel('Category')
ax.set_ylabel('Value')
ax.set_title('Simple Bar Plot using Seaborn')

plt.show()
