# What is pandas?


In [1]:
# Pandas is a powerful and versatile library that simplifies data manipulation in Python. Pandas is built on top of the
# NumPy library and is particularly well-suited for working with tabular data, such as spreadsheets or SQL tables. Its versatility
# and ease of use make it an essential tool for data analysts, scientists, and engineers working with structured data in Python.


# What can you do using Pandas?

In [2]:
# Pandas is generally used for data science, but have you wondered why? This is because pandas is used in conjunction with other
# libraries that are used for data science. It is built on top of the NumPy library, which means that a lot of structures of
# NumPy are used or replicated in Pandas. The data produced by Pandas is often used as input for plotting functions of
# Matplotlib, statistical analysis in SciPy, and machine learning algorithms in Scikit-learn. Here is a list of things that we
# can do using Pandas:

# Data set cleaning, merging, and joining.
# Easy handling of missing data (represented as NaN) in floating point as well as non-floating point data.
# Columns can be inserted and deleted from DataFrame and higher dimensional objects.
# Powerful group by functionality for performing split-apply-combine operations on data sets.
# Data Visualization

In [3]:
# Install and Import
# pip install pandas
# import pandas as pd

# Type of data Structures

# (1). Series:-

In [4]:
#  A Pandas Series is a one-dimensional labeled array capable of holding data of any type (integer, string, float,
# python objects, etc.). The axis labels are collectively called indexes.
# Pandas Series is nothing but a column in an Excel sheet. Labels need not be unique but must be a hashable type. The object 
# supports both integer and label-based indexing and provides a host of methods for performing operations involving the index.




# Creating a Series
import numpy as np
import pandas as pd

# An empty Series. The dtype is given explicitly so the result is `object`
# on every pandas version; pandas 1.x otherwise falls back to float64 and
# emits a deprecation warning about the changing default.
ser = pd.Series(dtype=object)
print("Pandas Series: ", ser)

# A Series built from a NumPy array of single-character strings. No index is
# passed, so pandas assigns the default integer labels 0..n-1.
data = np.array(['v', 'i', 'k', 'a', 's'])
ser = pd.Series(data)
print("Pandas Series:\n", ser)

Pandas Series:  Series([], dtype: object)
Pandas Series:
 0    v
1    i
2    k
3    a
4    s
dtype: object


In [5]:
# Accessing element of Series:-
# There are two ways through which we can access element of series, they are :

# Accessing Element from Series with Position
# Accessing Element Using Label (index)



# Accessing Element from Series with Position : To access a Series element by position, refer to its index number. Use the index
# operator [ ] to access an element in a series. The index must be an integer. In order to access multiple elements from a series,
# we use the slice operation.

# Accessing first 5 elements of Series:-
import numpy as np
import pandas as pd

# Build a Series from a NumPy array of characters (default 0..n-1 index).
data = np.array(['g','e','e','k','s','f', 'o','r','g','e','e','k','s'])
ser = pd.Series(data)

# Retrieve the first five elements by position. `.iloc` states the positional
# intent explicitly instead of relying on how `[]` interprets an integer
# slice.
print(ser.iloc[:5])


0    g
1    e
2    e
3    k
4    s
dtype: object


In [6]:
# Accessing Element Using Label (index) :
# In order to access an element from series, we have to set values by index label. A Series is like a fixed-size dictionary in 
# that you can get and set values by index label.

# Accessing a single element using index label:-

# Thirteen characters, labelled with the integers 10 through 22 instead of
# the default 0..12.
data = np.array(['g','e','e','k','s','f', 'o','r','g','e','e','k','s'])
labels = [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
ser = pd.Series(data, index=labels)

# Look up the element stored under the label 14 (a label, not a position).
print(ser.loc[14])

s


# Binary Operation on Series

In [7]:
# We can perform binary operations on series, such as addition, subtraction and many other operations. In order to perform a binary
# operation on series we can use methods like .add(), .sub() etc. (the arithmetic operators + and - also work on Series).
# Two integer Series whose labels only partly overlap: 'c' exists only in
# `data`, 'e' exists only in `data1`.
data = pd.Series({'a': 5, 'b': 2, 'c': 3, 'd': 7})
data1 = pd.Series({'a': 1, 'b': 6, 'd': 4, 'e': 9})

print(data, "\n\n", data1)

a    5
b    2
c    3
d    7
dtype: int64 

 a    1
b    6
d    4
e    9
dtype: int64


In [8]:
# Add the two Series label by label. `fill_value=0` substitutes 0 for a label
# that is present in only one of the two operands.
data.add(other=data1, fill_value=0)

a     6.0
b     8.0
c     3.0
d    11.0
e     9.0
dtype: float64

In [9]:
# Subtract `data1` from `data` label by label. `fill_value=0` substitutes 0
# for a label that is present in only one of the two operands.
data.sub(other=data1, fill_value=0)

a    4.0
b   -4.0
c    3.0
d    3.0
e   -9.0
dtype: float64

# Conversion Operation on Series

In [10]:
# In conversion operation we perform various operation like changing datatype of series, changing a series to list etc. In order
# to perform conversion operation we have various function which help in conversion like .astype(), .tolist()
# etc.

In [11]:
# Python program using astype to convert a datatype of series:-
 
# pandas provides read_csv and astype used below
import pandas as pd

# Read the toy COVID dataset from a local CSV file.
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs where that file exists.
data = pd.read_csv("D:/All Data of class/covid_toy.csv")

# Drop every row that contains a missing value so the integer cast below
# cannot fail on NaN.
data = data.dropna()

# Remember the column dtypes before converting.
before = data.dtypes

# Convert dtypes with astype. The integer width is spelled out ("int32")
# because a bare `int` maps to a platform-dependent NumPy integer, which
# would make the printed result differ between machines.
data["has_covid"] = data["has_covid"].astype(str)
data["age"] = data["age"].astype("int32")

# Column dtypes after converting.
after = data.dtypes

# Print both so they can be compared.
print("BEFORE CONVERSION\n", before, "\n")
print("AFTER CONVERSION\n", after, "\n")

BEFORE CONVERSION
 age            int64
gender        object
fever        float64
cough         object
city          object
has_covid     object
dtype: object 

AFTER CONVERSION
 age            int32
gender        object
fever        float64
cough         object
city          object
has_covid     object
dtype: object 



In [12]:
# Python program converting a series into list:-
# `re` is imported here but not referenced anywhere in this cell.
import re

# Load the dataset and remove rows with missing values.
data = pd.read_csv("D:/All Data of class/covid_toy.csv")
data.dropna(inplace=True)

# Pull the column out once, then record its Python type before and after
# turning it into a plain list.
has_covid = data["has_covid"]
dtype_before = type(has_covid)
covid_list = has_covid.tolist()
dtype_after = type(covid_list)

# Report both types.
report = "Data type before converting = {}\nData type after converting = {}"
print(report.format(dtype_before, dtype_after))

# Last expression of the cell: display the resulting list.
covid_list

Data type before converting = <class 'pandas.core.series.Series'>
Data type after converting = <class 'list'>


['No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'Yes',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'Yes',
 'Yes',
 'No',
 'No',
 'No',
 'No',
 'No',
 'Yes',
 'No',
 'Yes',
 'No',
 'No',
 'Yes']

# Binary operation methods on series:

In [13]:
# FUNCTION           	DESCRIPTION

# add()	           Method is used to add series or list like objects with same length to the caller series
# sub()	           Method is used to subtract series or list like objects with same length from the caller series
# mul()	           Method is used to multiply series or list like objects with same length with the caller series
# div()	           Method is used to divide the caller series by series or list like objects with same length
# sum()          	Returns the sum of the values for the requested axis
# prod()       	Returns the product of the values for the requested axis
# mean()      	Returns the mean of the values for the requested axis
# pow()      	Method is used to put each element of passed series as exponential power of caller series and returned the results
# abs()	        Method is used to get the absolute numeric value of each element in Series/DataFrame
# cov()	        Method is used to find covariance of two series
 

# Pandas series method:

In [14]:
# FUNCTION	             DESCRIPTION

# Series()                 A pandas Series can be created with the Series() constructor method. This constructor method accepts
# a variety of inputs


# combine_first()	         Method is used to combine two series into one


# count()                	Returns number of non-NA/null observations in the Series


# size	                Attribute (not a method) that returns the number of elements in the underlying data


# name	                Attribute (not a method) that holds the name of a Series object, i.e. of the column; assign to it to set the name


# is_unique	            Attribute (not a method) that is True if the values in the object are unique


# idxmax()	            Method to get the index label of the highest value in a Series (the first one if there are ties)


# idxmin()	            Method to get the index label of the lowest value in a Series (the first one if there are ties)


# sort_values()	        Method is called on a Series to sort the values in ascending or descending order


# sort_index()	        Method is called on a pandas Series to sort it by the index instead of its values


# head()	                Method is used to return a specified number of rows from the beginning of a Series. The method returns 
# a brand new Series


# tail()	                Method is used to return a specified number of rows from the end of a Series. The method returns a brand
# new Series



# le()	                Used to compare every element of Caller series with passed series.It returns True for every element
# which is Less than or Equal to the element in passed series



# ne()	                Used to compare every element of Caller series with passed series. It returns True for every element 
# which is Not Equal to the element in passed series




# ge()	                Used to compare every element of Caller series with passed series. It returns True for every element
# which is Greater than or Equal to the element in passed series




# eq()	                Used to compare every element of Caller series with passed series. It returns True for every element
# which is Equal to the element in passed series



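# gt()	                Used to compare every element of Caller series with passed series. It returns True for every element
# which is Greater than the element in passed series


# lt()	                Used to compare every element of Caller series with passed series. It returns True for every element
# which is Less than the element in passed series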


# clip()	                Used to clip value below and above to passed Least and Max value


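# clip_lower()	        Used to clip values below a passed least value (removed in pandas 1.0; use clip(lower=...) instead)


# clip_upper()	        Used to clip values above a passed maximum value (removed in pandas 1.0; use clip(upper=...) instead)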


# astype()	            Method is used to change data type of a series


# tolist()	            Method is used to convert a series to list


# get()	                Method is called on a Series to extract values from a Series. This is alternative syntax to the
# traditional bracket syntax



# unique()	            Pandas unique() is used to see the unique values in a particular column


# nunique()	            Pandas nunique() is used to get a count of unique values


# value_counts()	        Method to count the number of the times each unique value occurs in a Series


# factorize()	            Method helps to get the numeric representation of an array by identifying distinct values


# map()	                Method to tie together the values from one object to another


# between()	            Pandas between() method is used on series to check which values lie between first and second argument


# apply()	                Method is called and fed a Python function as an argument to use the function on every Series value.
# This method is helpful for executing custom operations that are not included in pandas or numpy

In [15]:
# (2). Data Frame:- Pandas DataFrame is a two-dimensional data structure with labeled axes (rows and columns)
    
# Creating a DataFrame
import pandas as pd

# With no arguments the constructor returns an empty frame.
df = pd.DataFrame()
print(df)

# A flat list of strings becomes a single column; pandas labels it 0.
lst = ['Vikas', 'Is', 'Student', 'of', 'Regex', 'software', 'in', 'jaipur']
df = pd.DataFrame(data=lst)
print(df)

Empty DataFrame
Columns: []
Index: []
          0
0     Vikas
1        Is
2   Student
3        of
4     Regex
5  software
6        in
7    jaipur


# Creating dataframe 


In [16]:
# Creating an empty DataFrame: no data is passed, so the frame has no columns
# and no index entries.
import pandas as pd

df = pd.DataFrame(data=None)
print(df)

Empty DataFrame
Columns: []
Index: []


In [17]:
# Creating a DataFrame from a list.
# The list is deliberately not named `list`: rebinding that name would hide
# the built-in type for every cell that runs afterwards.
words = ['Vikas', 'Is', 'Student', 'of', 'Regex', 'software', 'in', 'jaipur']

# Each list element becomes one row of a single column labelled 0.
df = pd.DataFrame(words)
df

Unnamed: 0,0
0,Vikas
1,Is
2,Student
3,of
4,Regex
5,software
6,in
7,jaipur


In [18]:
# Creating a DataFrame from a dict of lists: each key becomes a column label
# and each list supplies that column's values.
names = ['Vikas', 'kalpana', 'punit', 'Nikita ']
ages = [23, 21, 17, 22]
data = {'Name': names, 'Age': ages}

# Build the frame from the mapping.
df = pd.DataFrame(data)

# Show the result.
print(df)

      Name  Age
0    Vikas   23
1  kalpana   21
2    punit   17
3  Nikita    22


In [19]:
# Create a pandas DataFrame from a dictionary of lists.
# The dictionary is deliberately not named `dict`: rebinding that name would
# hide the built-in type for every cell that runs afterwards.
student_data = {'name':["Vinatak", "pankaj", "sudhir", "Virat"],
                'degree': ["MBA", "BCA", "M.Tech", "MBA"],
                'score':[90, 40, 80, 98]}

# Keys become column labels; the lists become the column values.
df = pd.DataFrame(student_data)

print(df)

      name  degree  score
0  Vinatak     MBA     90
1   pankaj     BCA     40
2   sudhir  M.Tech     80
3    Virat     MBA     98


# some basic function :-
    

In [20]:
# Load the placement dataset into `df`.
# NOTE(review): absolute, machine-specific path; the cell only runs where
# this file exists.
placement_csv = "D:/All Data of class/placement.csv"
df = pd.read_csv(placement_csv)

In [21]:
# Show the first five rows (five is also what head() returns by default).
df.head(n=5)

Unnamed: 0,cgpa,resume_score,placed
0,8.14,6.52,1
1,6.17,5.17,0
2,8.27,8.86,1
3,6.88,7.27,1
4,7.52,7.3,1


In [22]:
# Show the last five rows (five is also what tail() returns by default).
df.tail(n=5)

Unnamed: 0,cgpa,resume_score,placed
95,6.33,6.38,0
96,8.23,7.76,1
97,6.65,7.78,0
98,8.14,5.63,1
99,6.09,6.61,0


In [23]:
# Summary statistics for the columns: count, mean, std, min, the 25%/50%/75%
# quantiles and max.
df.describe()

Unnamed: 0,cgpa,resume_score,placed
count,100.0,100.0,100.0
mean,6.9422,6.9305,0.5
std,1.1192,0.979608,0.502519
min,5.27,4.95,0.0
25%,5.98,6.19,0.0
50%,6.62,7.055,0.5
75%,8.045,7.64,1.0
max,9.4,9.06,1.0


In [24]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cgpa          100 non-null    float64
 1   resume_score  100 non-null    float64
 2   placed        100 non-null    int64  
dtypes: float64(2), int64(1)
memory usage: 2.5 KB


In [25]:
# The dtype of every column, returned as a Series indexed by column name.
df.dtypes

cgpa            float64
resume_score    float64
placed            int64
dtype: object

In [26]:
# The column labels of the frame, returned as an Index object.
df.columns

Index(['cgpa', 'resume_score', 'placed'], dtype='object')

In [27]:
# Select the first five rows and the first three columns purely by position.
top_left_corner_df = df.iloc[0:5, 0:3]
top_left_corner_df

Unnamed: 0,cgpa,resume_score,placed
0,8.14,6.52,1
1,6.17,5.17,0
2,8.27,8.86,1
3,6.88,7.27,1
4,7.52,7.3,1


In [28]:
# Check whether the frame holds no data at all. `empty` is an attribute, so
# it is read without call parentheses.
df.empty

False

In [29]:
# Drop columns: build a frame without the 'cgpa' column and rebind `df` to
# it. `columns=` already states which axis the labels belong to, so no
# separate `axis` argument is passed.
df = df.drop(columns=['cgpa'])

In [30]:
# Display the current contents of `df` (the cell's last expression is
# rendered as its output).
df

Unnamed: 0,resume_score,placed
0,6.52,1
1,5.17,0
2,8.86,1
3,7.27,1
4,7.30,1
...,...,...
95,6.38,0
96,7.76,1
97,7.78,0
98,5.63,1


In [31]:
# Read the placement CSV again so `df` holds the complete file contents.
# NOTE(review): absolute, machine-specific path; the cell only runs where
# this file exists.
placement_csv = "D:/All Data of class/placement.csv"
df = pd.read_csv(placement_csv)

In [32]:
# Keep only the listed columns; the result is a new frame and `df` itself is
# not modified.
wanted_columns = ['cgpa', 'placed']
df.filter(items=wanted_columns)

Unnamed: 0,cgpa,placed
0,8.14,1
1,6.17,0
2,8.27,1
3,6.88,1
4,7.52,1
...,...,...
95,6.33,0
96,8.23,1
97,6.65,0
98,8.14,1


In [33]:
# Keep only the rows whose index labels are 5, 6 and 7 (filtering along the
# index rather than the columns).
wanted_rows = [5, 6, 7]
df.filter(items=wanted_rows, axis='index')

Unnamed: 0,cgpa,resume_score,placed
5,8.77,6.19,1
6,5.34,7.09,0
7,6.56,6.29,0


In [34]:
# Rename columns: map each old column label to its new one. The renamed
# frame is only displayed; it is not assigned back to `df`.
column_renames = {
    'cgpa': 'Marks',
    'resume_score': 'r_marks',
}
df.rename(columns=column_renames)

Unnamed: 0,Marks,r_marks,placed
0,8.14,6.52,1
1,6.17,5.17,0
2,8.27,8.86,1
3,6.88,7.27,1
4,7.52,7.30,1
...,...,...,...
95,6.33,6.38,0
96,8.23,7.76,1
97,6.65,7.78,0
98,8.14,5.63,1


In [35]:
# Copy resume_score into a new column, keeping each value that is greater
# than 5 and replacing every other value with 0.
resume_scores = df['resume_score']
above_five = resume_scores > 5
df['new_resume'] = resume_scores.where(above_five, other=0)
df

Unnamed: 0,cgpa,resume_score,placed,new_resume
0,8.14,6.52,1,6.52
1,6.17,5.17,0,5.17
2,8.27,8.86,1,8.86
3,6.88,7.27,1,7.27
4,7.52,7.30,1,7.30
...,...,...,...,...
95,6.33,6.38,0,6.38
96,8.23,7.76,1,7.76
97,6.65,7.78,0,7.78
98,8.14,5.63,1,5.63


# Pandas Extracting rows using .loc[]

In [36]:
# Python is a great language for doing data analysis, primarily because of the fantastic ecosystem of data-centric Python packages.
# Pandas is one of those packages and makes importing and analyzing data much easier. Pandas provides a unique method to retrieve
# rows from a DataFrame. DataFrame.loc[] is an indexer that takes index labels and returns a row or a DataFrame if the
# index label exists in the caller DataFrame.


# For single row:- 
