## Here we will push dataset inside MongoDb Atlas

In [1]:
import pandas as pd 
from pymongo import MongoClient
import certifi
from  dotenv import load_dotenv

In [2]:
# load both dataset
white_wine_df = pd.read_csv("winequality-white.csv" , delimiter = ';') 
red_wine_df = pd.read_csv("winequality-red.csv" , delimiter = ";")

In [3]:
white_wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [4]:
red_wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
print(white_wine_df.shape)
print(red_wine_df.shape)

(4898, 12)
(1599, 12)


In [6]:
# add new column named wine type
white_wine_df.insert(0 , column = 'wine type' , value = 'white')
red_wine_df.insert(0 , column = 'wine type' , value = 'red')

In [7]:
red_wine_df.head()

Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,red,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,red,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,red,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,red,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,red,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [8]:
white_wine_df.head()

Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [9]:
df = pd.concat([white_wine_df , red_wine_df] , axis = 0 , ignore_index = True)
df.sample(5)

Unnamed: 0,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
987,white,7.4,0.3,0.3,5.2,0.053,45.0,163.0,0.9941,3.12,0.45,10.3,6
2200,white,6.4,0.24,0.23,7.3,0.069,31.0,157.0,0.9962,3.25,0.53,9.1,5
2391,white,6.5,0.24,0.29,8.2,0.043,32.0,156.0,0.99453,3.13,0.7,10.1,6
5319,red,6.6,0.84,0.03,2.3,0.059,32.0,48.0,0.9952,3.52,0.56,12.3,7
2654,white,6.9,0.4,0.22,5.95,0.081,76.0,303.0,0.99705,3.4,0.57,9.4,5


In [12]:
# Shuffle rows
df = df.sample(frac = 1 , random_state = 2).reset_index(drop = True)

In [13]:
# as mongo expects dict format data so convert dataframe into dict
data = df.to_dict(orient = 'records')

In [14]:
print(df.shape)
print(len(data))

(6497, 13)
6497


In [15]:
data

[{'wine type': 'white',
  'fixed acidity': 6.0,
  'volatile acidity': 0.18,
  'citric acid': 0.31,
  'residual sugar': 1.4,
  'chlorides': 0.036,
  'free sulfur dioxide': 14.0,
  'total sulfur dioxide': 75.0,
  'density': 0.99085,
  'pH': 3.34,
  'sulphates': 0.58,
  'alcohol': 11.1,
  'quality': 8},
 {'wine type': 'white',
  'fixed acidity': 5.3,
  'volatile acidity': 0.395,
  'citric acid': 0.07,
  'residual sugar': 1.3,
  'chlorides': 0.035,
  'free sulfur dioxide': 26.0,
  'total sulfur dioxide': 102.0,
  'density': 0.992,
  'pH': 3.5,
  'sulphates': 0.35,
  'alcohol': 10.6,
  'quality': 6},
 {'wine type': 'red',
  'fixed acidity': 8.1,
  'volatile acidity': 0.56,
  'citric acid': 0.28,
  'residual sugar': 1.7,
  'chlorides': 0.368,
  'free sulfur dioxide': 16.0,
  'total sulfur dioxide': 56.0,
  'density': 0.9968,
  'pH': 3.11,
  'sulphates': 1.28,
  'alcohol': 9.3,
  'quality': 5},
 {'wine type': 'white',
  'fixed acidity': 6.4,
  'volatile acidity': 0.22,
  'citric acid': 0.34,


### steps to push data into mongoDb
- connect with the cluster
- create a database
- create a collection
- insert the data into mongo 

In [16]:
import os 
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL") # load from .env file
DATABASE_NAME = "wineData"
COLLECTION_NAME = "wineCollection"

In [17]:
import certifi
# connect to the database
client = MongoClient(DATABASE_URL , tlsCAFile = certifi.where())

In [18]:
type(client)

pymongo.synchronous.mongo_client.MongoClient

In [19]:
# create the database
database = client[DATABASE_NAME]
# create the collection
collection = database[COLLECTION_NAME]

In [20]:
# now insert the data into the collection
records = collection.insert_many(data)

In [21]:
records

InsertManyResult([ObjectId('68afcc53c87d04415342d4fb'), ObjectId('68afcc53c87d04415342d4fc'), ObjectId('68afcc53c87d04415342d4fd'), ObjectId('68afcc53c87d04415342d4fe'), ObjectId('68afcc53c87d04415342d4ff'), ObjectId('68afcc53c87d04415342d500'), ObjectId('68afcc53c87d04415342d501'), ObjectId('68afcc53c87d04415342d502'), ObjectId('68afcc53c87d04415342d503'), ObjectId('68afcc53c87d04415342d504'), ObjectId('68afcc53c87d04415342d505'), ObjectId('68afcc53c87d04415342d506'), ObjectId('68afcc53c87d04415342d507'), ObjectId('68afcc53c87d04415342d508'), ObjectId('68afcc53c87d04415342d509'), ObjectId('68afcc53c87d04415342d50a'), ObjectId('68afcc53c87d04415342d50b'), ObjectId('68afcc53c87d04415342d50c'), ObjectId('68afcc53c87d04415342d50d'), ObjectId('68afcc53c87d04415342d50e'), ObjectId('68afcc53c87d04415342d50f'), ObjectId('68afcc53c87d04415342d510'), ObjectId('68afcc53c87d04415342d511'), ObjectId('68afcc53c87d04415342d512'), ObjectId('68afcc53c87d04415342d513'), ObjectId('68afcc53c87d04415342d5

In [22]:
# get all data from the database(checking data pushed have any issue or not)
data_list = list(collection.find())
len(data_list)

6497

In [23]:
test_df = pd.DataFrame(data_list)
test_df.head()

Unnamed: 0,_id,wine type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,68afcc53c87d04415342d4fb,white,6.0,0.18,0.31,1.4,0.036,14.0,75.0,0.99085,3.34,0.58,11.1,8
1,68afcc53c87d04415342d4fc,white,5.3,0.395,0.07,1.3,0.035,26.0,102.0,0.992,3.5,0.35,10.6,6
2,68afcc53c87d04415342d4fd,red,8.1,0.56,0.28,1.7,0.368,16.0,56.0,0.9968,3.11,1.28,9.3,5
3,68afcc53c87d04415342d4fe,white,6.4,0.22,0.34,1.4,0.023,56.0,115.0,0.98958,3.18,0.7,11.7,6
4,68afcc53c87d04415342d4ff,red,9.4,0.27,0.53,2.4,0.074,6.0,18.0,0.9962,3.2,1.13,12.0,7


### Data Pushed to MongoDb