In [12]:
# Import the pandas library and give it an alias 'pd' for easy reference

import pandas as pd

The pandas library provides powerful data structures for data analysis and manipulation.

`pd` is the conventional alias for pandas; it lets you call pandas functions more concisely (for example, `pd.DataFrame` instead of `pandas.DataFrame`).

Example: Creating a DataFrame (table of data) with pandas

In [13]:
# Sample data as a plain dict: each key is a column name and each list
# holds that column's values, one entry per row.
mydataset = {
    'cars': ['toyota', 'BMW', 'crolla'],
    'Model': [1997, 2024, 2002],
}

In [14]:
# Show the raw dictionary before it is turned into a DataFrame
print(mydataset)

{'cars': ['toyota', 'BMW', 'crolla'], 'Model': [1997, 2024, 2002]}


In [16]:
# Build a DataFrame from the dictionary: the keys become the column names
# and the lists become the column values
myvar = pd.DataFrame(mydataset)

In [17]:
# Display the DataFrame (a bare name on the last line of a cell is rendered as output)
myvar

Unnamed: 0,cars,Model
0,toyota,1997
1,BMW,2024
2,crolla,2002


In [18]:
# Select a single column by its name; the result is displayed with its index, name and dtype
myvar['Model']

0    1997
1    2024
2    2002
Name: Model, dtype: int64

In [19]:
# `pd.__version__` holds the version string of the imported pandas package.
# Printing it is a quick way to confirm which pandas version this environment is using.

print(pd.__version__)

1.4.2


In [20]:
# A plain Python list holding the integers 1 through 8
a = list(range(1, 9))

In [21]:
# Display the list
a

[1, 2, 3, 4, 5, 6, 7, 8]

In [23]:
# Create a pandas Series (a one-dimensional column of values with an index) from the list
var1 = pd.Series(a)

In [24]:
# Display the Series: index labels on the left, values on the right, dtype at the bottom
var1

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64

In [26]:
# Explanation:
# pd.read_csv() reads a CSV (Comma-Separated Values) file and returns its contents
# as a DataFrame, a table-like structure.
# 'dataset.csv' is a relative path, so the file must be in the current working
# directory; otherwise pass the full path to the file instead.

df = pd.read_csv('dataset.csv')

In [27]:
# Display the first rows of the DataFrame (up to 5; fewer if the file has fewer rows)
df.head()

Unnamed: 0,Cars,Model
0,toyota,1997
1,bmw,2024


In [29]:
# Load a second CSV file. NOTE: this rebinds `df`, so the DataFrame read from
# 'dataset.csv' is no longer reachable under this name; every cell below works on 'data.csv'.
df = pd.read_csv('data.csv')

In [30]:
# Display the first 5 rows of the newly loaded DataFrame
df.head()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0


In [31]:
# Display the last 5 rows of the DataFrame (the counterpart of head())
df.tail()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4
168,75,125,150,330.4


In [32]:
# Display all rows of the DataFrame: to_string() renders the whole table as
# plain text, which print() then writes out in full
print(df.to_string())

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

In [33]:
# Display summary information about the DataFrame:
# index range, column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 5.4 KB


Explanation:
 The `info()` method provides a concise summary of the DataFrame.
It shows:
  1. The number of rows and columns in the DataFrame.
  2. The name of each column.
  3. The data type of each column (e.g., int64, float64, object, etc.).
  4. The number of non-null (non-missing) entries in each column.
  5. The memory usage of the DataFrame (can be useful for large datasets).

In [36]:
# Check for missing values in the DataFrame.
# isnull() yields a table of the same shape with True where a value is missing
# and False otherwise; to_string() is used so every row is printed.
print(df.isnull().to_string())

     Duration  Pulse  Maxpulse  Calories
0       False  False     False     False
1       False  False     False     False
2       False  False     False     False
3       False  False     False     False
4       False  False     False     False
5       False  False     False     False
6       False  False     False     False
7       False  False     False     False
8       False  False     False     False
9       False  False     False     False
10      False  False     False     False
11      False  False     False     False
12      False  False     False     False
13      False  False     False     False
14      False  False     False     False
15      False  False     False     False
16      False  False     False     False
17      False  False     False      True
18      False  False     False     False
19      False  False     False     False
20      False  False     False     False
21      False  False     False     False
22      False  False     False     False
23      False  F

In [37]:
# Check for non-null (valid) values in the DataFrame.
# notnull() is the inverse of isnull(): True where a value is present,
# False where it is missing.
print(df.notnull().to_string())

     Duration  Pulse  Maxpulse  Calories
0        True   True      True      True
1        True   True      True      True
2        True   True      True      True
3        True   True      True      True
4        True   True      True      True
5        True   True      True      True
6        True   True      True      True
7        True   True      True      True
8        True   True      True      True
9        True   True      True      True
10       True   True      True      True
11       True   True      True      True
12       True   True      True      True
13       True   True      True      True
14       True   True      True      True
15       True   True      True      True
16       True   True      True      True
17       True   True      True     False
18       True   True      True      True
19       True   True      True      True
20       True   True      True      True
21       True   True      True      True
22       True   True      True      True
23       True   

In [40]:
# Print a copy of the DataFrame with the rows containing missing values removed.
# The result of dropna() is not assigned back, so `df` itself is left unchanged
# and still contains the rows with missing values.
print(df.dropna().to_string())

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45    105       132     246.0
24         60   

In [41]:
# Calculate the mean (average) value of the 'Calories' column.
# The column still contains missing entries at this point; they are left out of
# the calculation, so the result is a number rather than NaN.
mean1 = df['Calories'].mean()

In [42]:
# Display the computed mean
mean1

375.79024390243916

In [43]:
# Fill missing values in the 'Calories' column with the mean value.
# The filled column is assigned back to df instead of calling
# fillna(..., inplace=True) on df['Calories']: that form modifies an
# intermediate Series (chained assignment), which raises a FutureWarning on
# recent pandas 2.x releases and leaves df unchanged once Copy-on-Write is
# enabled (the default from pandas 3.0).
# Re-running this cell is harmless: once filled, there is nothing left to fill.
df['Calories'] = df['Calories'].fillna(mean1)

In [44]:
# Print the full DataFrame again to verify that the missing 'Calories'
# entries now hold the mean value
print(df.to_string())

     Duration  Pulse  Maxpulse     Calories
0          60    110       130   409.100000
1          60    117       145   479.000000
2          60    103       135   340.000000
3          45    109       175   282.400000
4          45    117       148   406.000000
5          60    102       127   300.000000
6          60    110       136   374.000000
7          45    104       134   253.300000
8          30    109       133   195.100000
9          60     98       124   269.000000
10         60    103       147   329.300000
11         60    100       120   250.700000
12         60    106       128   345.300000
13         60    104       132   379.300000
14         60     98       123   275.000000
15         60     98       120   215.200000
16         60    100       120   300.000000
17         45     90       112   375.790244
18         60    103       123   323.000000
19         45     97       125   243.000000
20         60    108       131   364.200000
21         45    100       119  