#### Pandas
Pandas is a powerful data manipulation library in Python, widely used for data analysis and data cleaning. It provides two primary data structures: Series and DataFrame. A Series is a one-dimensional array-like object, while a DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns).

In [1]:
!pip install pandas 
import pandas 
import pandas as pd



In [2]:
## Series
## A pandas Series is a one-dimensional, array-like object that can hold any data type.
## It is similar to a column in a spreadsheet or a SQL table.
## The values in a Series are mutable (they can be changed after creation).
## The axis labels are collectively referred to as the index.

import pandas as pd

# Build a Series from a plain list. No index is passed, so the values are
# labelled with the default integers 0..4.
data = [10, 20, 30, 40, 50]
series = pd.Series(data)
print("Series \n",series)
print("Type of Series \n", type(series))

Series 
 0    10
1    20
2    30
3    40
4    50
dtype: int64
Type of Series 
 <class 'pandas.core.series.Series'>


In [3]:
# Build a Series from a dictionary: the keys become the index labels
# and the values become the data.
import pandas as pd

data = dict(a=1, b=2, c=3)
series = pd.Series(data=data)
print(series)

a    1
b    2
c    3
dtype: int64


In [4]:
# Assign explicit index labels: the ten values are labelled 1..10
# instead of the default 0..9.
srno = pd.Series(
    list(range(10, 101, 10)),
    index=list(range(1, 11)),
)

In [5]:
# A bare expression on the last line of a cell is shown as the cell's output.
srno

1      10
2      20
3      30
4      40
5      50
6      60
7      70
8      80
9      90
10    100
dtype: int64

In [10]:
import pandas as pd

# The values and their labels are kept in two parallel lists.
data = [10, 20, 30]
index = ["a", "b", "c"]

# Build the Series, pairing each value with the label at the same position.
new = pd.Series(data=data, index=index)

In [11]:
# Display the Series built from the separate data and index lists.
new

a    10
b    20
c    30
dtype: int64

In [20]:
# Build a DataFrame from a dictionary of equal-length lists:
# each key becomes a column name and each list supplies that column's values.
import pandas as pd

data = dict(
    Name=["Bhimrao", "Samarth", "Trisha", "Swati"],
    Age=[21, 22, 23, 24],
    City=["Pune", "Mumbai", "Delhi", "Banglore"],
)

df = pd.DataFrame(data=data)
print(df)

      Name  Age      City
0  Bhimrao   21      Pune
1  Samarth   22    Mumbai
2   Trisha   23     Delhi
3    Swati   24  Banglore


In [21]:
import numpy as np

# Convert the DataFrame's values to a NumPy array.
df.to_numpy()

array([['Bhimrao', 21, 'Pune'],
       ['Samarth', 22, 'Mumbai'],
       ['Trisha', 23, 'Delhi'],
       ['Swati', 24, 'Banglore']], dtype=object)

In [15]:
# Build a DataFrame from a list of dictionaries:
# each dictionary is one row and its keys are the column names.
data = [
    {"Name": "John", "Age": 25, "City": "New York"},
    {"Name": "Jane", "Age": 30, "City": "London"},
    {"Name": "Bob", "Age": 35, "City": "Paris"},
    {"Name": "Alice", "Age": 40, "City": "Tokyo"},
    {"Name": "Mike", "Age": 45, "City": "New York"},
    {"Name": "Sarah", "Age": 50, "City": "London"},
    {"Name": "Tom", "Age": 55, "City": "Paris"},
    {"Name": "Emily", "Age": 60, "City": "Tokyo"},
    {"Name": "David", "Age": 65, "City": "New York"},
    {"Name": "Jessica", "Age": 70, "City": "London"},
]

df = pd.DataFrame(data=data)
print(df)

      Name  Age      City
0     John   25  New York
1     Jane   30    London
2      Bob   35     Paris
3    Alice   40     Tokyo
4     Mike   45  New York
5    Sarah   50    London
6      Tom   55     Paris
7    Emily   60     Tokyo
8    David   65  New York
9  Jessica   70    London


In [16]:
# head(3) limits the printout to the first three rows.
print(df.head(3))


   Name  Age      City
0  John   25  New York
1  Jane   30    London
2   Bob   35     Paris


In [23]:
# Load the sales data from a CSV file given by a relative path.
# This rebinds df, replacing the small example frame built earlier.
df = pd.read_csv('sales_data.csv')

In [24]:
# Preview the top of the data; with no argument, head() shows the first five rows.
df.head()

Unnamed: 0,Transaction ID,Date,Product Category,Product Name,Units Sold,Unit Price,Total Revenue,Region,Payment Method
0,10001,01-01-2024,Electronics,iPhone 14 Pro,2,999.99,1999.98,North America,Credit Card
1,10002,02-01-2024,Home Appliances,Dyson V11 Vacuum,1,499.99,499.99,Europe,PayPal
2,10003,03-01-2024,Clothing,Levi's 501 Jeans,3,69.99,209.97,Asia,Debit Card
3,10004,04-01-2024,Books,The Da Vinci Code,4,15.99,63.96,North America,Credit Card
4,10005,05-01-2024,Beauty Products,Neutrogena Skincare Set,1,89.99,89.99,Europe,PayPal


In [26]:
# Preview the end of the data; with no argument, tail() shows the last five rows.
df.tail()

Unnamed: 0,Transaction ID,Date,Product Category,Product Name,Units Sold,Unit Price,Total Revenue,Region,Payment Method
235,10236,23-08-2024,Home Appliances,Nespresso Vertuo Next Coffee and Espresso Maker,1,159.99,159.99,Europe,PayPal
236,10237,24-08-2024,Clothing,Nike Air Force 1 Sneakers,3,90.0,270.0,Asia,Debit Card
237,10238,25-08-2024,Books,The Handmaid's Tale by Margaret Atwood,3,10.99,32.97,North America,Credit Card
238,10239,26-08-2024,Beauty Products,Sunday Riley Luna Sleeping Night Oil,1,55.0,55.0,Europe,PayPal
239,10240,27-08-2024,Sports,Yeti Rambler 20 oz Tumbler,2,29.99,59.98,Asia,Credit Card


In [27]:
# Dimensions of the frame as (number of rows, number of columns).
df.shape

(240, 9)

In [28]:
# Per-column overview: non-null count and dtype, plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Transaction ID    240 non-null    int64  
 1   Date              240 non-null    object 
 2   Product Category  240 non-null    object 
 3   Product Name      240 non-null    object 
 4   Units Sold        240 non-null    int64  
 5   Unit Price        240 non-null    float64
 6   Total Revenue     240 non-null    float64
 7   Region            240 non-null    object 
 8   Payment Method    240 non-null    object 
dtypes: float64(2), int64(2), object(5)
memory usage: 17.0+ KB


In [29]:
# The column labels of the frame.
df.columns

Index(['Transaction ID', 'Date', 'Product Category', 'Product Name',
       'Units Sold', 'Unit Price', 'Total Revenue', 'Region',
       'Payment Method'],
      dtype='object')

In [30]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Transaction ID,Units Sold,Unit Price,Total Revenue
count,240.0,240.0,240.0,240.0
mean,10120.5,2.158333,236.395583,335.699375
std,69.42622,1.322454,429.446695,485.804469
min,10001.0,1.0,6.5,6.5
25%,10060.75,1.0,29.5,62.965
50%,10120.5,2.0,89.99,179.97
75%,10180.25,3.0,249.99,399.225
max,10240.0,10.0,3899.99,3899.99


In [31]:
# Transpose the summary so each numeric column becomes a row and each statistic a column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Transaction ID,240.0,10120.5,69.42622,10001.0,10060.75,10120.5,10180.25,10240.0
Units Sold,240.0,2.158333,1.322454,1.0,1.0,2.0,3.0,10.0
Unit Price,240.0,236.395583,429.446695,6.5,29.5,89.99,249.99,3899.99
Total Revenue,240.0,335.699375,485.804469,6.5,62.965,179.97,399.225,3899.99


In [32]:
# duplicated() flags rows that repeat an earlier row; summing the flags counts them.
df.duplicated().sum()

np.int64(0)

In [33]:
# Count the missing values in each column.
df.isnull().sum()

Transaction ID      0
Date                0
Product Category    0
Product Name        0
Units Sold          0
Unit Price          0
Total Revenue       0
Region              0
Payment Method      0
dtype: int64

In [34]:
# The same per-column missing-value count, written with isna() instead of isnull().
df.isna().sum()

Transaction ID      0
Date                0
Product Category    0
Product Name        0
Units Sold          0
Unit Price          0
Total Revenue       0
Region              0
Payment Method      0
dtype: int64

In [36]:
## Accessing data from a DataFrame
# `data` is rebound several times earlier in the notebook (a list, dictionaries,
# a list of dictionaries). Define the Name/Age/City dictionary explicitly here so
# this section does not depend on which earlier cell happened to run last.
data = {
    'Name': ['Bhimrao', 'Samarth', 'Trisha', 'Swati'],
    'Age': [21, 22, 23, 24],
    'City': ['Pune', 'Mumbai', 'Delhi', 'Banglore'],
}
data

{'Name': ['Bhimrao', 'Samarth', 'Trisha', 'Swati'],
 'Age': [21, 22, 23, 24],
 'City': ['Pune', 'Mumbai', 'Delhi', 'Banglore']}

In [39]:
# Rebuild df from `data`; this replaces the sales frame that df referred to before.
df = pd.DataFrame(data)
# Display the rebuilt frame.
df

Unnamed: 0,Name,Age,City
0,Bhimrao,21,Pune
1,Samarth,22,Mumbai
2,Trisha,23,Delhi
3,Swati,24,Banglore


In [40]:
# Select a single column by its label.
df['Name']

0    Bhimrao
1    Samarth
2     Trisha
3      Swati
Name: Name, dtype: object

In [43]:
# A single selected column is a Series, not a DataFrame.
print(type(df['Name']))

<class 'pandas.core.series.Series'>


In [46]:
df.loc[0] # select the row whose index label is 0

Name    Bhimrao
Age          21
City       Pune
Name: 0, dtype: object

In [47]:
df.loc[1] # select the row whose index label is 1

Name    Samarth
Age          22
City     Mumbai
Name: 1, dtype: object

In [48]:
df.iloc[0] # select the row at integer position 0 (iloc is position-based; this is a row, not a column)

Name    Bhimrao
Age          21
City       Pune
Name: 0, dtype: object

In [49]:
# Select the row at integer position 1.
df.iloc[1]

Name    Samarth
Age          22
City     Mumbai
Name: 1, dtype: object

In [50]:
# Row position 0, column position 2, in a single positional indexer.
# The chained form df.iloc[0][2] looks the integer 2 up on a row Series whose
# index is the column labels, which pandas reports with a warning.
df.iloc[0, 2]

  df.iloc[0][2]


'Pune'

In [51]:
# Row position 1, column position 2, in a single positional indexer.
# The chained form df.iloc[1][2] looks the integer 2 up on a row Series whose
# index is the column labels, which pandas reports with a warning.
df.iloc[1, 2]

  df.iloc[1][2]


'Mumbai'

In [52]:
# Accessing specific elements
# Start from a whole column, selected by its label.
df['Name']

0    Bhimrao
1    Samarth
2     Trisha
3      Swati
Name: Name, dtype: object

In [53]:
# at: single-value lookup by row label (1) and column label ('Age').
df.at[1,'Age']

np.int64(22)

In [54]:
# at: single-value lookup by row label (2) and column label ('Age').
df.at[2,'Age']

np.int64(23)

In [55]:
# at: single-value lookup by row label (2) and column label ('Name').
df.at[2,'Name']

'Trisha'

In [56]:
# iat: single-value lookup by integer position (row 1, column 2).
df.iat[1,2]

'Mumbai'

In [57]:
# iat: single-value lookup by integer position (row 2, column 1).
df.iat[2,1]

np.int64(23)

In [58]:
# Data manipulation with a DataFrame
# Show the frame as it is before a new column is added
df

Unnamed: 0,Name,Age,City
0,Bhimrao,21,Pune
1,Samarth,22,Mumbai
2,Trisha,23,Delhi
3,Swati,24,Banglore


In [59]:
# Assigning a list to a new column label adds that column; the list supplies one value per row.
df["Salary"] = [230,330,340,450]

In [60]:
# Display the frame; Salary now appears as the last column.
df

Unnamed: 0,Name,Age,City,Salary
0,Bhimrao,21,Pune,230
1,Samarth,22,Mumbai,330
2,Trisha,23,Delhi,340
3,Swati,24,Banglore,450


In [61]:
# Remove a column.
# drop() hands back a new DataFrame without 'Salary'; df itself is left untouched.
remove = df.drop(columns='Salary')

In [62]:
# Display the frame returned by drop(); it has no Salary column.
remove

Unnamed: 0,Name,Age,City
0,Bhimrao,21,Pune
1,Samarth,22,Mumbai
2,Trisha,23,Delhi
3,Swati,24,Banglore


In [63]:
# Checking the original DataFrame shows that the Salary column is still present
df

Unnamed: 0,Name,Age,City,Salary
0,Bhimrao,21,Pune,230
1,Samarth,22,Mumbai,330
2,Trisha,23,Delhi,340
3,Swati,24,Banglore,450


In [64]:
# To remove the Salary column from df itself, pass inplace=True:
# the frame is modified directly and nothing is assigned back.
df.drop(columns='Salary', inplace=True)

In [65]:
# Display the frame; this time the Salary column is gone from df itself.
df

Unnamed: 0,Name,Age,City
0,Bhimrao,21,Pune
1,Samarth,22,Mumbai
2,Trisha,23,Delhi
3,Swati,24,Banglore


In [66]:
# Update an existing column: add 1 to every value in Age.
# Note: the column is read and overwritten, so each re-run of this cell adds 1 again.
df['Age'] = df['Age']+1   # Age is increased by 1

In [67]:
# Display the frame; every Age value is one higher than before.
df

Unnamed: 0,Name,Age,City
0,Bhimrao,22,Pune
1,Samarth,23,Mumbai
2,Trisha,24,Delhi
3,Swati,25,Banglore


In [68]:
# Drop a row: remove the row whose index label is 0, modifying df directly.
df.drop(index=0, inplace=True)

In [69]:
# Display the frame; the row labelled 0 is gone and the remaining labels are not renumbered.
df

Unnamed: 0,Name,Age,City
1,Samarth,23,Mumbai
2,Trisha,24,Delhi
3,Swati,25,Banglore
