# Gathering Data
 ## 1. CSV 
 ## 2. JSON/SQL
 ## 3. Fetch API 
 ## 4. Web scraping
  


## Today: CSV

In [None]:
# csv stands for Comma Separated Values
# It is a simple file format used to store tabular data,
# such as a spreadsheet or database.
# Each line in a CSV file represents a data record,
# and each record consists of one or more fields separated by commas.   
# CSV files are widely used for data exchange between different applications
# because they are easy to read and write for both humans and machines. 
# They can be opened and edited using text editors, spreadsheet software like Microsoft Excel,
# or imported into programming languages for data analysis and manipulation.
# TSV stands for Tab Separated Values.
# It is a file format used to store tabular data,
# similar to CSV (Comma Separated Values) files,
# but instead of using commas to separate fields, it uses tab characters.

In [None]:
import pandas as pd 

In [None]:
# Open a local CSV file with pandas.
# The library is imported as `pd`, so the bare name `pandas` is undefined here
# and would raise a NameError.
data = pd.read_csv('placement.csv')

# Opening a CSV file from a URL


In [None]:
import requests
from io import StringIO

# Download the CSV text over HTTP and hand it to pandas through an in-memory
# text buffer, so nothing is written to disk.
url = 'https://people.sc.fsu.edu/~jburkardt/data/csv/hw_200.csv'
# A timeout stops the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an HTML error page as CSV.
response.raise_for_status()
data = StringIO(response.text)
# pandas is imported as `pd`; the bare name `pandas` is not defined.
df = pd.read_csv(data)
df

# sep parameter

In [None]:
# `sep` parameter of pandas.read_csv:
#   - tells pandas which character separates the fields in the file
#   - defaults to a comma (,)
#   - for TSV files it must be set to sep='\t'
# names=['sno','name','age','city'] supplies custom column names.
# NOTE(review): with `names` given and no `header` argument, pandas treats the
# first line of the file as data, not as a header — confirm titanic.tsv has
# no header row, or add header=0 to replace it.
# pandas is imported as `pd`; the bare name `pandas` is not defined.
data = pd.read_csv('titanic.tsv', sep='\t', names=['sno', 'name', 'age', 'city'])
data

# 5. Index_col parameter 


In [None]:
# index_col names the column whose values become the DataFrame's row labels,
# replacing the default integer index.
pd.read_csv(
    'placement.csv',
    index_col='cgpa',
)

# 6. Header parameter

In [None]:
# header=0 makes the first row of the CSV file (row 0) supply the column
# names of the DataFrame.
pd.read_csv(
    'placement.csv',
    header=0,
)

# 7. usecols parameter

In [None]:
# The usecols parameter of pandas.read_csv restricts the read to a subset of
# the file's columns; every other column is ignored.
# Here only the 'iq' and 'cgpa' columns are loaded.
wanted_columns = ['iq', 'cgpa']
pd.read_csv('placement.csv', usecols=wanted_columns)

# 8. Squeeze parameter

In [None]:
# The `squeeze` parameter of pandas.read_csv used to turn a single-column
# result into a Series instead of a DataFrame.
# It was deprecated in pandas 1.4 and removed in 2.0. The supported
# replacement is to read the single column and then call
# DataFrame.squeeze('columns') on the result.
# Note that .squeeze() belongs on the DataFrame returned by read_csv, not on
# the `usecols` list.
pd.read_csv('placement.csv', usecols=['cgpa']).squeeze('columns')

# 9. Skiprows/nrows Parameter

In [None]:
# skiprows accepts a list of 0-based line numbers to leave out while reading.
# With the default header handling, line 0 is the header row, so [1, 2] drops
# the first two data records and keeps the column names.
lines_to_skip = [1, 2]
pd.read_csv('placement.csv', skiprows=lines_to_skip)

# nrows caps how many data rows are read from the file.
# Here only the first 5 rows end up in the DataFrame.
pd.read_csv('placement.csv', nrows=5)

# 10. Encoding parameter 

In [None]:
# The encoding parameter tells pandas.read_csv which character encoding to
# use when decoding the file's bytes into text.
# UTF-8 is a common choice. Passing the right encoding matters for files with
# special or non-ASCII characters, which are otherwise misread or rejected.
pd.read_csv(
    'placement.csv',
    encoding='utf-8',
)


# 11. Skip bad lines 


In [None]:
# Skip malformed lines while reading a CSV file.
# A "bad line" is one with too many fields for the expected column layout.
# on_bad_lines='skip' drops such lines and keeps reading the rest of the file
# instead of raising an error.
# The older spelling error_bad_lines=False was deprecated in pandas 1.3 and
# removed in 2.0, so it raises a TypeError on current versions.
pd.read_csv('placement.csv', on_bad_lines='skip')

In [None]:
# Read the file with every read_csv option left at its default.
pd.read_csv('placement.csv')

# 12. dtype parameter

In [None]:
# The dtype parameter of pandas.read_csv fixes the data type of one or more
# columns at read time instead of letting pandas infer it.
# Here 'cgpa' is read as float and 'iq' as int.
column_types = {'cgpa': float, 'iq': int}
pd.read_csv('placement.csv', dtype=column_types)

# 13. Handling Dates 

In [None]:

# The parse_dates parameter of pandas.read_csv lists the columns that should
# be parsed as dates rather than left as plain strings.
# NOTE(review): 'date_column' looks like a placeholder — confirm that
# placement.csv really has a column with this name.
date_columns = ['date_column']
pd.read_csv('placement.csv', parse_dates=date_columns)

# 14. Converters


In [None]:
# Load only the 'cgpa' column; the result is a one-column DataFrame.
pd.read_csv('placement.csv',usecols=['cgpa'])

In [None]:
# dtype maps column names to the type each column should be read as, so the
# types are fixed at read time rather than inferred.
# Here 'cgpa' is read as float and 'iq' as int.
pd.read_csv(
    'placement.csv',
    dtype={'cgpa': float, 'iq': int},
)

In [None]:
def cgpa_change(cgpa):
    """Map a CGPA value to a letter grade.

    Written to be used as a ``converters`` callback for ``pd.read_csv``, so
    the input may be a number or a numeric string.

    Args:
        cgpa: The CGPA as a number or a string holding a number.

    Returns:
        'A+' for a CGPA of 9 or more, 'A' for 8 or more, 'B' for 7 or more,
        and 'C' for anything lower. None when the value cannot be converted
        to a number or is NaN.
    """
    import math  # local import keeps this notebook cell self-contained

    try:
        cgpa = float(cgpa)  # convert string -> number
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "no grade". A bare `except:` would
        # also swallow KeyboardInterrupt and hide real bugs.
        return None

    # NaN compares False against every threshold, so without this check a
    # missing value would silently be graded 'C'.
    if math.isnan(cgpa):
        return None

    if cgpa >= 9:
        return 'A+'
    if cgpa >= 8:
        return 'A'
    if cgpa >= 7:
        return 'B'
    return 'C'


In [None]:
# Quick sanity check: 9.1 is >= 9, so this should evaluate to 'A+'.
cgpa_change(9.1)

In [None]:
# Using the converters parameter of pandas.read_csv.
# converters maps a column name to a function; pandas calls that function on
# every value of the column while the file is being read.
# Here each 'cgpa' value is passed through cgpa_change.
column_converters = {'cgpa': cgpa_change}
pd.read_csv('placement.csv', converters=column_converters)

# 15. na_values parameter 

In [None]:
# Read the file with default options, before any na_values are applied.
pd.read_csv('placement.csv')

In [None]:
# na_values lists extra strings that pandas should read as missing (NA/NaN),
# in addition to the markers it already recognises by default.
# Here every cell whose text is '123.0' is loaded as NaN.
extra_missing_markers = ['123.0']
pd.read_csv('placement.csv', na_values=extra_missing_markers)


# 16. Loading a huge dataset in chunks


In [90]:
# With chunksize=2, read_csv does not load the whole file at once. It returns
# an iterator that yields DataFrames of at most 2 rows each, so a huge file
# can be processed piece by piece.
# Using the reader as a context manager (supported since pandas 1.2) closes
# the underlying file handle even if the loop stops early or raises.
with pd.read_csv('placement.csv', chunksize=2) as dfs:
    for chunk in dfs:
        print(chunk.shape)

(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
(2, 4)
