In [1]:
import pandas as pd
import numpy as np

#### 1. Creating a Pandas DataFrame

In [6]:
# Warm-up: a dict whose keys are column labels and whose values are equal-length lists
# converts directly into a DataFrame with a default integer index.
list1 = {'col_1': [1, 2, 3], 'col_2': ['A', 'B', 'C'], 'col_3': ['a', 'b', 'c']}
print(pd.DataFrame(data=list1))

   col_1 col_2 col_3
0      1     A     a
1      2     B     b
2      3     C     c


In [5]:
# a) Dict of equal-length lists (NumPy arrays work the same way).
#    Every key turns into a column name; rows get the default 0..n-1 index.
data_dict = {'col1': [1, 2, 3, 4], 'col2': [10, 20, 30, 40], 'col3': ['A', 'B', 'C', 'D']}
df_from_dict = pd.DataFrame(data=data_dict)
print("DataFrame from dict of lists (default index):\n", df_from_dict)

DataFrame from dict of lists (default index):
    col1  col2 col3
0     1    10    A
1     2    20    B
2     3    30    C
3     4    40    D


In [18]:
# Reuse data_dict, but label the rows explicitly instead of taking the default integer index.
# The index needs one label per row (four here).
custom_index = ['row1', 'row2', 'row3', 'row4']
df_from_dict_custom_index = pd.DataFrame(data=data_dict, index=custom_index)
print("DataFrame from dict of lists (custom index):\n", df_from_dict_custom_index)

DataFrame from dict of lists (custom index):
       col1  col2 col3
row1     1    10    A
row2     2    20    B
row3     3    30    C
row4     4    40    D


In [16]:
# b) Dict of Series: each key becomes a column, and rows are matched up by index label.
#    Labels missing from one Series yield NaN in that column.
series_dict = {
    'population': pd.Series(
        {'Ohio': 1.5e6, 'Texas': 1.7e6, 'California': 3.6e6, 'Florida': 2.4e6, 'Illinois': 2.1e6}
    ),
    # No 'Illinois' entry below, so that row has no area value.
    'area_sq_km': pd.Series(
        {'Ohio': 119e3, 'Texas': 695e3, 'California': 423e3, 'Florida': 170e3}
    ),
}
df_from_series = pd.DataFrame(data=series_dict)
print("DataFrame from dict of Series (indices aligned, NaN where missing):\n", df_from_series)
print("-" * 20)

DataFrame from dict of Series (indices aligned, NaN where missing):
             population  area_sq_km
California   3600000.0    423000.0
Florida      2400000.0    170000.0
Illinois     2100000.0         NaN
Ohio         1500000.0    119000.0
Texas        1700000.0    695000.0
--------------------


In [19]:
# c) List of dicts: one dict per row.
#    The columns are the union of all keys; a key absent from a row shows up as NaN there.
list_of_dicts = [
    {'a': 1, 'b': 10},            # no 'c' or 'd'
    {'a': 2, 'b': 20, 'c': 100},  # no 'd'
    {'a': 3, 'd': 200},           # no 'b' or 'c'
]
df_from_list_dicts = pd.DataFrame(data=list_of_dicts, index=['r1', 'r2', 'r3'])
print("DataFrame from list of dicts:\n", df_from_list_dicts)

DataFrame from list of dicts:
     a     b      c      d
r1  1  10.0    NaN    NaN
r2  2  20.0  100.0    NaN
r3  3   NaN    NaN  200.0


In [28]:
# d) From a 2D NumPy array
# A seeded Generator is used so the "random" values are identical on every
# Restart & Run All; an unseeded draw would change the table on each run.
rng = np.random.default_rng(seed=42)
numpy_2d = rng.standard_normal((4, 3))  # 4x3 array of standard-normal draws
# index/columns supply the row and column labels; the array only carries the values.
df_from_numpy = pd.DataFrame(numpy_2d, index=['n1', 'n2', 'n3', 'n4'], columns=['X', 'Y', 'Z'])
print("DataFrame from 2D NumPy array:\n", df_from_numpy)

DataFrame from 2D NumPy array:
            X         Y         Z
n1 -0.903518  0.988431 -0.450755
n2  1.071125 -0.217376  1.462624
n3 -0.203345  0.176490  0.636177
n4  0.643788 -2.491312  0.020752


In [30]:
# e) From another DataFrame
# NOTE: pd.DataFrame(other_df) does NOT copy by default -- for DataFrame input the
# default copy=None behaves like copy=False, so the new frame can share data with the
# original. Pass copy=True to get an independent copy.
df_copy = pd.DataFrame(df_from_dict, copy=True)
print("DataFrame created as a copy of another:\n", df_copy)
# Because copy=True was requested, modifying df_copy will not affect df_from_dict

DataFrame created as a copy of another:
    col1  col2 col3
0     1    10    A
1     2    20    B
2     3    30    C
3     4    40    D


#### 2. DataFrame Attributes

In [31]:
# Show the frame whose attributes are inspected in the cells below.
print("DataFrame:\n", df_from_series)

DataFrame:
             population  area_sq_km
California   3600000.0    423000.0
Florida      2400000.0    170000.0
Illinois     2100000.0         NaN
Ohio         1500000.0    119000.0
Texas        1700000.0    695000.0


In [39]:
# .columns: the column labels, returned as a pandas Index object (not a plain list)
print(f"Columns: {df_from_series.columns}")

Columns: Index(['population', 'area_sq_km'], dtype='object')


In [41]:
# .values: Get the data as a 2D NumPy array (one array row per DataFrame row; labels are dropped)
# Note: If dtypes are mixed, the resulting NumPy array dtype will be 'object'
print(f"\nValues (NumPy array):\n{df_from_series.values}\n")
print(f"Type of values: {type(df_from_series.values)}\n")
print(f"Dtype of values array: {df_from_series.values.dtype}") # float64 here because both columns are float64; otherwise it may be object


Values (NumPy array):
[[3600000.  423000.]
 [2400000.  170000.]
 [2100000.      nan]
 [1500000.  119000.]
 [1700000.  695000.]]

Type of values: <class 'numpy.ndarray'>

Dtype of values array: float64


In [42]:
# .dtypes: a Series mapping each column name to that column's data type.
# The trailing "dtype: object" in the output describes this Series itself, not the columns.
print(f"\nData types of columns (dtypes):\n{df_from_series.dtypes}")


Data types of columns (dtypes):
population    float64
area_sq_km    float64
dtype: object


In [43]:
# .shape: a (rows, columns) tuple -- 5 rows by 2 columns for df_from_series
print(f"\nShape (rows, columns): {df_from_series.shape}")


Shape (rows, columns): (5, 2)


In [44]:
# .size: total number of cells (rows * columns), counting NaN cells too -- 5 * 2 = 10 here
print(f"Size (total elements): {df_from_series.size}")

Size (total elements): 10


#### 3. Basic Inspection Methods

In [45]:
# Re-display the frame used by the inspection examples in this section.
print("DataFrame:\n", df_from_series)

DataFrame:
             population  area_sq_km
California   3600000.0    423000.0
Florida      2400000.0    170000.0
Illinois     2100000.0         NaN
Ohio         1500000.0    119000.0
Texas        1700000.0    695000.0


In [46]:
# .head(n=5): View the first n rows (default is 5); n=3 is passed explicitly here
print("\nFirst 3 rows (.head(3)):\n", df_from_series.head(3))


First 3 rows (.head(3)):
             population  area_sq_km
California   3600000.0    423000.0
Florida      2400000.0    170000.0
Illinois     2100000.0         NaN


In [47]:
# .tail(n=5): View the last n rows (default is 5); n=2 is passed explicitly here
print("\nLast 2 rows (.tail(2)):\n", df_from_series.tail(2))


Last 2 rows (.tail(2)):
        population  area_sq_km
Ohio    1500000.0    119000.0
Texas   1700000.0    695000.0


In [48]:
# .info(): Get a concise summary (index dtype, column dtypes, non-null counts, memory usage)
# The non-null counts are a quick way to spot missing data: area_sq_km has 4 of 5 values.
print("\nDataFrame Info (.info()):")
df_from_series.info() # Writes the summary itself, so it is not wrapped in print()


DataFrame Info (.info()):
<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, California to Texas
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   population  5 non-null      float64
 1   area_sq_km  4 non-null      float64
dtypes: float64(2)
memory usage: 120.0+ bytes


In [49]:
# .describe(): Generate descriptive statistics for numerical columns
# Includes count, mean, std, min, 25th percentile, 50th (median), 75th, max
# count only tallies non-missing values, so area_sq_km (one NaN) reports 4 rather than 5.
print("\nDescriptive Statistics (.describe()):\n", df_from_series.describe())


Descriptive Statistics (.describe()):
          population     area_sq_km
count  5.000000e+00       4.000000
mean   2.260000e+06  351750.000000
std    8.264381e+05  264639.849607
min    1.500000e+06  119000.000000
25%    1.700000e+06  157250.000000
50%    2.100000e+06  296500.000000
75%    2.400000e+06  491000.000000
max    3.600000e+06  695000.000000


In [50]:
# .describe() on non-numerical data: include='object' limits the summary to
# object/string columns. The report has count, unique, top (most frequent value)
# and freq (how many times top occurs).
df_obj = pd.DataFrame(
    data={
        'col_str': ['a', 'b', 'a', 'c', 'b', 'a'],
        'col_num': [1, 2, 3, 4, 5, 6],
    }
)
print("DataFrame with object type:\n", df_obj)
print(
    "\nDescribe object columns (.describe(include='object')):\n",
    df_obj.describe(include='object'),
)

DataFrame with object type:
   col_str  col_num
0       a        1
1       b        2
2       a        3
3       c        4
4       b        5
5       a        6

Describe object columns (.describe(include='object')):
        col_str
count        6
unique       3
top          a
freq         3


In [51]:
# Describe all columns, regardless of dtype
print("\nDescribe all columns (.describe(include='all')):\n", df_from_series.describe(include='all'))
# Combines numerical and non-numerical summaries (NaN where not applicable)
# NOTE: df_from_series contains only float64 columns, so this prints the same table as the
# plain .describe() call; a mixed-dtype frame such as df_obj is needed to see both kinds of rows.


Describe all columns (.describe(include='all')):
          population     area_sq_km
count  5.000000e+00       4.000000
mean   2.260000e+06  351750.000000
std    8.264381e+05  264639.849607
min    1.500000e+06  119000.000000
25%    1.700000e+06  157250.000000
50%    2.100000e+06  296500.000000
75%    2.400000e+06  491000.000000
max    3.600000e+06  695000.000000
