<a href="https://colab.research.google.com/github/VINY1958/polars/blob/main/Polars_and_Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this cell to import the Kaggle data source for this notebook.
# It can be deleted once the data is available some other way.
# Note: this notebook environment differs from Kaggle's Python environment,
# so libraries the notebook relies on may be missing here.
import kagglehub

# Kaggle dataset handle in "<owner>/<dataset>" form.
DATASET_HANDLE = 'phamletunhi/data-analysis2024-test-data-polars-vs-pandas'

# Keep whatever dataset_download() returns so later cells can refer to it.
phamletunhi_data_analysis2024_test_data_polars_vs_pandas_path = kagglehub.dataset_download(DATASET_HANDLE)

print('Data source import complete.')

# Polars & Pandas
## Similarities
### DataFrame Creation:

In [None]:
# Build the same small two-column table with each library.
import pandas as pd
import polars as pl

# A single column-oriented dict feeds both constructors.
data = {'A': [1, 2, 3], 'B': [4, 5, 6]}

df_pandas = pd.DataFrame(data)  # Pandas
df_polars = pl.DataFrame(data)  # Polars

In [None]:
# Show the Pandas frame first, then the Polars frame, one after the other.
print(df_pandas, df_polars, sep='\n')

### Column Selection:



In [None]:
# Both libraries accept bracket indexing with a column name.
column_name = 'A'

column_pandas = df_pandas[column_name]  # Pandas
column_polars = df_polars[column_name]  # Polars

print("pandas: ", column_pandas)
print("-----------------------------------")
print("polars: ", column_polars)

## Differences
### Adding Columns:



In [None]:
# Pandas: assign the new column directly on the existing frame.
df_pandas['C'] = df_pandas['A'].add(df_pandas['B'])

# Polars: describe the new column as an expression, then rebind the name
# to the frame returned by with_columns().
sum_of_a_and_b = (pl.col('A') + pl.col('B')).alias('C')
df_polars = df_polars.with_columns(sum_of_a_and_b)

print("Pandas: \n", df_pandas)
print("Polars: ", df_polars)

### Filtering Rows:


In [None]:
# Pandas: build a boolean mask, then select the matching rows.
pandas_mask = df_pandas['A'] > 2
filtered_pandas = df_pandas.loc[pandas_mask]

# Polars: pass a column expression to filter().
polars_predicate = pl.col('A') > 2
filtered_polars = df_polars.filter(polars_predicate)

print("Pandas: \n", filtered_pandas)

print("Polars: ", filtered_polars)

### Group By and Aggregation:


In [None]:
# Pandas: group on 'A' and take the mean of 'B' per group.
grouped_pandas = df_pandas.groupby('A').agg({'B': 'mean'})

# Polars: same aggregation, with the result column named 'mean_B'.
# group_by() does not promise any particular row order, so the result is
# sorted by the key to make the printed output reproducible from run to run
# and directly comparable with the Pandas result above.
grouped_polars = (
    df_polars
    .group_by('A')
    .agg(pl.mean('B').alias('mean_B'))
    .sort('A')
)

print(grouped_pandas)
print(grouped_polars)

### Chaining Operations:



In [None]:
# Pandas: derive 'C' and aggregate in one method chain. assign() returns a
# new frame, so this cell no longer mutates df_pandas (the 'C' column was
# already added to it in the "Adding Columns" cell).
result_pandas = (
    df_pandas
    .assign(C=lambda frame: frame['A'] + frame['B'])
    .groupby('C')
    .agg({'B': 'mean'})
)

# Polars: the equivalent chain. group_by() does not promise any particular
# row order, so sort by the key to keep the printed output reproducible and
# comparable with the Pandas result.
result_polars = (
    df_polars
    .with_columns((pl.col('A') + pl.col('B')).alias('C'))
    .group_by('C')
    .agg(pl.mean('B').alias('mean_B'))
    .sort('C')
)

print(result_pandas)
print(result_polars)

### Conversion to CSV Files:


In [None]:
# Write each frame to its own CSV file using the library's writer method.
pandas_csv = 'pandas.csv'
polars_csv = 'polars.csv'

df_pandas.to_csv(pandas_csv)     # Pandas
df_polars.write_csv(polars_csv)  # Polars

## **Compare Pandas vs. Polars: CSV Load Time**

In [None]:
import os
import time
import matplotlib.pyplot as plt

# Locate the benchmark CSV. Prefer the directory recorded by the kagglehub
# download cell, so the notebook also works outside Kaggle (e.g. on Colab);
# if that cell was not run (or was deleted), fall back to the Kaggle input
# directory this notebook originally read from.
KAGGLE_INPUT_DIR = '/kaggle/input/data-analysis2024-test-data-polars-vs-pandas'
dataset_dir = globals().get(
    'phamletunhi_data_analysis2024_test_data_polars_vs_pandas_path',
    KAGGLE_INPUT_DIR,
)
csv_path = os.path.join(dataset_dir, 'numeric_dataset_10000000.csv')

# time.perf_counter() is the clock intended for measuring short durations;
# unlike time.time() it is not affected by system clock adjustments.
start_time = time.perf_counter()
df_pandas = pd.read_csv(csv_path)
end_time = time.perf_counter()
pandas_load_time = end_time - start_time

start_time = time.perf_counter()
df_polars = pl.read_csv(csv_path)
end_time = time.perf_counter()
polars_load_time = end_time - start_time

In [None]:
# Bar chart of the two measured load times, one bar per library.
libraries = ['Pandas', 'Polars']
load_times = [pandas_load_time, polars_load_time]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(libraries, load_times, color=['#f7bc48', '#69b6cc'])
ax.set_xlabel('Library')
ax.set_ylabel('Load Time (seconds)')
ax.set_title('Load Time Comparison Between Pandas and Polars')
plt.show()