In [1]:
import pandas as pd

### <center>Creating Data Frames</center>

The main focus of this chapter is the pandas library for data analysis, and we begin here with an introduction to this library. The pandas library mainly provides data structures and methods for representing and manipulating data. The two main data structures in Pandas are the Series and DataFrame objects, which are used to represent data series and tabular data, respectively. Both of these objects have an index for accessing elements or rows in the data represented by the object. By default, the indices are integers starting from zero, like NumPy arrays, but it is also possible to use any sequence of identifiers as index.

In [2]:
df = pd.DataFrame([[909976, "Sweden"],
                   [8615246, "United Kingdom"],
                   [2872086, "Italy"],
                   [2273305, "France"]])

df.index = ["Stockholm", "London", "Rome", "Paris"]
df.columns = ["Population", "Country"]

df


# Assignment index, columns another way to create a data frame
# df = pd.DataFrame([[909976, "Sweden"],
#                   [8615246, "United Kingdom"],
#                   [2872086, "Italy"],
#                   [2273305, "France"]],
#                  index=["Stockholm", "London", "Rome", "Paris"],
#                  columns=["Population", "Country"])


# Creation of a data frame with a dictionary
# df = pd.DataFrame({"Population": [909976, 8615246, 2872086, 2273305],
#                   "Country": ["Sweden", "United Kingdom", "Italy", "France"]},
#                  index=["Stockholm", "London", "Rome", "Paris"])

Unnamed: 0,Population,Country
Stockholm,909976,Sweden
London,8615246,United Kingdom
Rome,2872086,Italy
Paris,2273305,France


In [3]:
df.info() # Gives general info about the data frame

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Stockholm to Paris
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Population  4 non-null      int64 
 1   Country     4 non-null      object
dtypes: int64(1), object(1)
memory usage: 96.0+ bytes


In [4]:
# The result of extracting a column from a DataFrame is a new Series object, which we can process and manipulate
# with the methods discussed in the previous section. Rows of a DataFrame instance can be accessed using the loc
# indexer attribute

# Locating values .loc[row, column]
a1 = df.loc["Rome"]
print(a1)
print(type(a1))  # Series is obtained


Population    2872086
Country         Italy
Name: Rome, dtype: object
<class 'pandas.core.series.Series'>


In [5]:
b1 = df.loc[["Rome", "Paris"]]
print(b1)
print(type(b1))  # Data frame is obtained

       Population Country
Rome      2872086   Italy
Paris     2273305  France
<class 'pandas.core.frame.DataFrame'>


In [6]:
c1 = df.loc["London":, "Population"]  # specifying the column
print(c1)
# Same result different approach
# c1 = df["Population"][1:] with this format numerical index can be expressed

London    8615246
Rome      2872086
Paris     2273305
Name: Population, dtype: int64


In [7]:
d1 = df[["Population", "Country"]][1:]
print(d1)

        Population         Country
London     8615246  United Kingdom
Rome       2872086           Italy
Paris      2273305          France


In [8]:
# integer location .iloc[]
ai1 = df.iloc[2]
print(ai1)
print()
bi1 = df.iloc[2:]
print(bi1)
print()
ci1 = df.iloc[1:, 0]
print(ci1)
print()
di1 = df.iloc[1:, :]
print(di1)
print()
print()

Population    2872086
Country         Italy
Name: Rome, dtype: object

       Population Country
Rome      2872086   Italy
Paris     2273305  France

London    8615246
Rome      2872086
Paris     2273305
Name: Population, dtype: int64

        Population         Country
London     8615246  United Kingdom
Rome       2872086           Italy
Paris      2273305          France




####  Changing values in a Data Frame

In [9]:
data_frame = pd.DataFrame({
    "first": ["John", "Jane", "Max"],
    "last": ["Moore", "Carrol", "O'Brian"],
    "email": ["Jm@gmail.com", "Jc@gmail.com", "Mob@gmail.com"]
})

# Rename the columns
data_frame.rename(columns={"first": "first_name", "last": "last_name"},
                  inplace=True)  # inplace is the confirmation button for changes applied
data_frame

Unnamed: 0,first_name,last_name,email
0,John,Moore,Jm@gmail.com
1,Jane,Carrol,Jc@gmail.com
2,Max,O'Brian,Mob@gmail.com


In [10]:
# Rename object in dataframe

data_frame.loc[2, "email"] = "Maxbria@gmail.com"
data_frame

Unnamed: 0,first_name,last_name,email
0,John,Moore,Jm@gmail.com
1,Jane,Carrol,Jc@gmail.com
2,Max,O'Brian,Maxbria@gmail.com


In [11]:
# Applying multiple changes

# .apply() method: works only in a series

len_email = data_frame["email"].apply(len)  # Function is applied to every element
len_email

0    12
1    12
2    17
Name: email, dtype: int64

In [12]:
data_frame["email"] = data_frame["email"].apply(lambda x: x.lower())
data_frame

Unnamed: 0,first_name,last_name,email
0,John,Moore,jm@gmail.com
1,Jane,Carrol,jc@gmail.com
2,Max,O'Brian,maxbria@gmail.com


In [13]:
# .applymap() method: works in the entire Dataframe

len_all = data_frame.applymap(len)
len_all


Unnamed: 0,first_name,last_name,email
0,4,5,12
1,4,6,12
2,3,7,17


####  Columns

In [14]:
# Adding a column
data_frame["full_name"] = data_frame["first_name"] + " " + data_frame["last_name"]
data_frame

Unnamed: 0,first_name,last_name,email,full_name
0,John,Moore,jm@gmail.com,John Moore
1,Jane,Carrol,jc@gmail.com,Jane Carrol
2,Max,O'Brian,maxbria@gmail.com,Max O'Brian


In [15]:
# Deleting a column
data_frame.drop(columns=["full_name"], inplace=True)
data_frame

Unnamed: 0,first_name,last_name,email
0,John,Moore,jm@gmail.com
1,Jane,Carrol,jc@gmail.com
2,Max,O'Brian,maxbria@gmail.com


#### Lines

In [16]:
#  Adding lines
data_frame = data_frame.append({'first_name': 'Tony', 'last_name': 'The Fat'}, ignore_index=True, sort=False)
data_frame

  data_frame = data_frame.append({'first_name': 'Tony', 'last_name': 'The Fat'}, ignore_index=True, sort=False)


Unnamed: 0,first_name,last_name,email
0,John,Moore,jm@gmail.com
1,Jane,Carrol,jc@gmail.com
2,Max,O'Brian,maxbria@gmail.com
3,Tony,The Fat,


In [17]:
# Removing lines
filt = data_frame["first_name"] == "Tony"
data_frame.drop(index=data_frame[filt].index, inplace=True)
data_frame

Unnamed: 0,first_name,last_name,email
0,John,Moore,jm@gmail.com
1,Jane,Carrol,jc@gmail.com
2,Max,O'Brian,maxbria@gmail.com
