# Data Analysis Operations in Python using Pandas

In [21]:
import pandas as pd
import numpy as np

## Reading Data from Different Formats

In [22]:
csv_data = pd.read_csv('Data.csv')  

# Reading an Excel file
#excel_data = pd.read_excel('sample_data.xlsx')  

## Finding the Shape of the Data,Null values

In [23]:
data = csv_data

# b) Find Shape of Data
shape_of_data = data.shape
print(f"Shape of the data: {shape_of_data}")

# c) Find Missing Values
missing_values = data.isnull().sum()
print(f"Missing values per column:\n{missing_values}")

Shape of the data: (10, 4)
Missing values per column:
Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


## Finding the Data Type of Each Column
## Finding the Zero in each Column

In [24]:
# d) Find data type of each column
data_types = data.dtypes
print(f"Data types of columns:\n{data_types}")

# e) Finding out Zeros
zero_counts = (data == 0).sum()
print(f"Number of zeros in each column:\n{zero_counts}")

Data types of columns:
Country       object
Age          float64
Salary       float64
Purchased     object
dtype: object
Number of zeros in each column:
Country      0
Age          0
Salary       0
Purchased    0
dtype: int64


## Indexing, Selecting, and Sorting Data

In [25]:
selected_data = data.iloc[:5, :3]  # First 5 rows, first 3 columns
print(f"Selected Data:\n{selected_data}")


Selected Data:
   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN


In [26]:
sorted_data = data.sort_values(by='Age', ascending=True)
print(f"Data sorted by 'Age':\n{sorted_data.head()}")


Data sorted by 'Age':
   Country   Age   Salary Purchased
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
5   France  35.0  58000.0       Yes
9   France  37.0  67000.0       Yes
3    Spain  38.0  61000.0        No


## Counting Unique Values in Data

In [27]:
unique_counts = data.nunique()
print(f"Unique value counts for each column:\n{unique_counts}")


Unique value counts for each column:
Country      3
Age          9
Salary       9
Purchased    2
dtype: int64


## Formatting and Converting Variable Data Types

In [28]:
df = pd.DataFrame(data)
df['Salary'] = df['Salary'].apply(lambda x: f"${x:,.2f}")
print("Formatted 'Salary' column as currency:")
print(df['Salary'])




Formatted 'Salary' column as currency:
0    $72,000.00
1    $48,000.00
2    $54,000.00
3    $61,000.00
4          $nan
5    $58,000.00
6    $52,000.00
7    $79,000.00
8    $83,000.00
9    $67,000.00
Name: Salary, dtype: object


In [29]:
df['Purchased'] = df['Purchased'].map({'Yes': True, 'No': False})
print("Converted 'Purchased' column to boolean:")
print(df['Purchased'])


Converted 'Purchased' column to boolean:
0    False
1     True
2    False
3    False
4     True
5     True
6    False
7     True
8    False
9     True
Name: Purchased, dtype: bool


## Describing the Attributes of the Data

In [30]:
## Describing the Attributes of the Data
data_description = data.describe(include='all')  # Summary statistics
print(f"Description of data:\n{data_description}")


Description of data:
       Country        Age        Salary Purchased
count       10   9.000000      9.000000        10
unique       3        NaN           NaN         2
top     France        NaN           NaN        No
freq         4        NaN           NaN         5
mean       NaN  38.777778  63777.777778       NaN
std        NaN   7.693793  12265.579662       NaN
min        NaN  27.000000  48000.000000       NaN
25%        NaN  35.000000  54000.000000       NaN
50%        NaN  38.000000  61000.000000       NaN
75%        NaN  44.000000  72000.000000       NaN
max        NaN  50.000000  83000.000000       NaN
