In [1]:
import pandas as pd

### Opening a Local CSV file

In [2]:
# Load the obesity dataset from a local file. The path is written with forward
# slashes, so no backslash escaping is needed.
df = pd.read_csv(
    "C:/Users/hp/Desktop/Machine Learning/Datasets/ObesityDataSet.csv"
)

In [3]:
# Same file, Windows-style path: the r prefix makes this a raw string, so the
# backslashes are taken literally instead of starting escape sequences.
df = pd.read_csv(
    r"C:\Users\hp\Desktop\Machine Learning\Datasets\ObesityDataSet.csv"
)

In [4]:
# Preview the first five rows of the loaded frame
df.head(n=5)

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS,NObeyesdad
0,21.0,Female,1.62,64.0,no,no,2.0,3.0,no,no,2.0,yes,0.0,1.0,Sometimes,Public_Transportation,Normal_Weight
1,21.0,Female,1.52,56.0,Sometimes,no,3.0,3.0,yes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,23.0,Male,1.8,77.0,Frequently,no,2.0,3.0,no,no,2.0,yes,2.0,1.0,Sometimes,Public_Transportation,Normal_Weight
3,27.0,Male,1.8,87.0,Frequently,no,3.0,3.0,no,no,2.0,no,2.0,0.0,Sometimes,Walking,Overweight_Level_I
4,22.0,Male,1.78,89.8,Sometimes,no,2.0,1.0,no,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


### Opening a CSV file from a URL

In [5]:
import requests 
from io import StringIO

In [6]:
# Download the CSV over HTTP and wrap the response text in an in-memory,
# file-like buffer so that pd.read_csv can consume it like a local file.
url = "https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv"
# A browser-like User-Agent is sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
# The timeout keeps the cell from hanging indefinitely if the server never answers.
req = requests.get(url, headers=headers, timeout=30)
# Fail loudly on HTTP errors (404, 500, ...) instead of silently parsing an
# error page as if it were CSV data.
req.raise_for_status()
data = StringIO(req.text)

In [7]:
# Rewind the in-memory buffer before parsing: read_csv consumes the stream, so
# without this a second run of the cell would find nothing left to read.
data.seek(0)
df = pd.read_csv(data)

In [8]:
# Preview the first five rows of the downloaded data
df.head(n=5)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA
2,Benin,AFRICA
3,Botswana,AFRICA
4,Burkina,AFRICA


### Sep Parameter

In [9]:
# The reviews file separates fields with tabs rather than commas, so the
# delimiter is passed explicitly through sep
df = pd.read_csv(
    "C:/Users/hp/Desktop/Machine Learning/Datasets/Restaurant_Reviews.tsv",
    sep="\t",
)

print(df.head())

                                              Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1


### Index_col Parameter

In [10]:
# index_col promotes the 'id' column to the row index instead of leaving it
# as an ordinary data column
df = pd.read_csv(
    "C:/Users/hp/Desktop/Machine Learning/Datasets/data.csv",
    index_col="id",
)

# Show the resulting frame
print(df)

    name  age         city
id                        
1   John   23     New York
2   Anna   22  Los Angeles
3   Mike   24      Chicago


### Header Parameter

In [11]:
# header=0: take the column names from the first line of the file (row index 0)
df = pd.read_csv('C:/Users/hp/Desktop/Machine Learning/Datasets/data.csv', header=0)

# Display the DataFrame
print(df)

   id  name  age         city
0   1  John   23     New York
1   2  Anna   22  Los Angeles
2   3  Mike   24      Chicago


### usecols Parameter

In [12]:
# usecols restricts parsing to the listed columns; here only 'name' and 'city'
# end up in the frame
df = pd.read_csv(
    "C:/Users/hp/Desktop/Machine Learning/Datasets/data.csv",
    usecols=["name", "city"],
)

# Show the resulting frame
print(df)

   name         city
0  John     New York
1  Anna  Los Angeles
2  Mike      Chicago


### Squeeze Parameter

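When the parameter is set to True, `read_csv` returns a Series instead of a DataFrame if the parsed data contains only a single column (a file with a single row but several columns still yields a DataFrame).
Note: `squeeze` was deprecated in pandas 1.4 and removed in pandas 2.0; on newer versions, call `.squeeze("columns")` on the DataFrame returned by `read_csv` instead.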

### Skiprows/nrows Parameter

In [14]:
# skiprows=2 skips the first 2 lines of the file, and that count includes the
# header line, so the first remaining line ("2, Anna, ...") is used as the column names.
# nrows=3 then reads at most 3 data rows after that.
# NOTE(review): to keep the original header and skip only data rows, a list-like
# such as skiprows=range(1, 3) is presumably what was intended — confirm.
df = pd.read_csv('C:/Users/hp/Desktop/Machine Learning/Datasets/data.csv', skiprows=2, nrows=3)

# Display the DataFrame
print(df)

   2  Anna  22 Los Angeles
0  3  Mike  24     Chicago


### Encoding Parameter

Common Encoding Formats (passed to `read_csv` as `encoding='...'`):
* utf-8: The default encoding used by `read_csv` and by most modern files; it can represent every Unicode character.
* latin1 (ISO-8859-1): Often used for files that contain accented characters (common in European languages).
* utf-16: Used when the file is encoded in UTF-16.
* cp1252: Common in files created on Windows systems.
* ascii: Basic American Standard Code for Information Interchange, without special characters.

### dtype Parameter

In [17]:
# dtype maps column names to the type they should be parsed as; 'age' is read
# as float rather than the integer type it would otherwise get
df = pd.read_csv(
    "C:/Users/hp/Desktop/Machine Learning/Datasets/data.csv",
    dtype={"age": float},
)

# Show the resulting frame
print(df)

   id  name   age         city
0   1  John  23.0     New York
1   2  Anna  22.0  Los Angeles
2   3  Mike  24.0      Chicago


### Handling Dates

In [21]:
# parse_dates asks read_csv to convert the listed columns to datetime values
# while loading, instead of leaving them as plain strings
df = pd.read_csv(
    "C:/Users/hp/Desktop/Machine Learning/Datasets/data.csv",
    parse_dates=["date"],
)

In [23]:
# Summarise the whole frame (column dtypes, non-null counts, memory usage).
# Calling info() on head() would only describe the first five rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   id      3 non-null      int64         
 1   name    3 non-null      object        
 2   age     3 non-null      int64         
 3   city    3 non-null      object        
 4   date    3 non-null      datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 248.0+ bytes


### How to Load Large Datasets in Chunks with pandas

In [24]:
# Number of rows per chunk, and a list that collects each chunk as it is read
chunk_size = 1000
chunks = []

# Load the dataset in chunks. With chunksize set, read_csv returns a reader that
# yields DataFrames of up to chunk_size rows; using it as a context manager
# closes the underlying file when the loop finishes or fails.
# 'large_data.csv' must exist in the working directory, otherwise
# FileNotFoundError is raised.
with pd.read_csv('large_data.csv', chunksize=chunk_size) as reader:
    for chunk in reader:
        print(chunk.shape)

        # Append the chunk to the list (if you want to concatenate all chunks into one)
        chunks.append(chunk)

# Concatenate all chunks into a single DataFrame
df = pd.concat(chunks, axis=0)
print(df.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'large_data.csv'