In [1]:
pip install pyspark






[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder \
    .appName("BigBasket Analysis") \
    .getOrCreate()


In [3]:
# Load the dataset from the provided path
file_path = "C:/Users/S560092/Downloads/BigBasket Products.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)


In [4]:
# Show the first few rows of the dataset
df.show(5)


+--------------------+--------------------+--------------------+--------------------+-----------------+----------+------------+--------------------+------+--------------------+
|               index|             product|            category|        sub_category|            brand|sale_price|market_price|                type|rating|         description|
+--------------------+--------------------+--------------------+--------------------+-----------------+----------+------------+--------------------+------+--------------------+
|                   1|Garlic Oil - Vege...|    Beauty & Hygiene|           Hair Care|Sri Sri Ayurveda |       220|         220|    Hair Oil & Serum|   4.1|This Product cont...|
|                   2|Water Bottle - Or...|Kitchen, Garden &...|Storage & Accesso...|       Mastercook|       180|         180|Water & Fridge Bo...|   2.3|Each product is m...|
|                   3|Brass Angle Deep ...|Cleaning & Household|         Pooja Needs|              Trm|       119| 

In [5]:
# goal1  Top 10 Most Reviewed Products (By count of products per category)
df.groupBy("category").count().orderBy("count", ascending=False).show(10)

+--------------------+-----+
|             product|count|
+--------------------+-----+
|                NULL| 5379|
| tricks & more vi...|   38|
| on occasion manu...|   32|
|Turmeric Powder/A...|   26|
| want a condition...|   25|
| now available in...|   21|
| the actually pro...|   19|
| keeping the hair...|   19|
|          phosphorus|   17|
|           cocktails|   16|
+--------------------+-----+
only showing top 10 rows



root
 |-- index: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- sub_category: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- sale_price: string (nullable = true)
 |-- market_price: string (nullable = true)
 |-- type: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- description: string (nullable = true)



In [10]:
from pyspark.sql.functions import col

# Convert rating column to DoubleType
df = df.withColumn("rating", col("rating").cast("double"))


In [12]:
# Goal 3: Rating Distribution
rating_distribution = df.groupBy("rating").count().orderBy("rating").show()


+------+-----+
|rating|count|
+------+-----+
|  NULL|18304|
|   1.0|  387|
|   1.2|    2|
|   1.3|    9|
|   1.4|    6|
|   1.5|   32|
|   1.6|    3|
|   1.7|   22|
|   1.8|   22|
|   1.9|    4|
|   2.0|  237|
|   2.1|   10|
|   2.2|   24|
|   2.3|   94|
|   2.4|   29|
|   2.5|  132|
|   2.6|   58|
|   2.7|  115|
|   2.8|  125|
|   2.9|   79|
+------+-----+
only showing top 20 rows



In [11]:
# Goal 2: Top 10 Products by Average Rating
top_rated_products = df.groupBy("product").avg("rating").orderBy("avg(rating)", ascending=False).show(10)


+--------------------+-----------+
|             product|avg(rating)|
+--------------------+-----------+
|            Guar Gum|        5.0|
|Eau-De-Mehfil Eau...|        5.0|
|Organic Shield - ...|        5.0|
|Lemon and Active ...|        5.0|
|Vitamin D Gummies...|        5.0|
|Coconut & Olive O...|        5.0|
|Glass Belleza Bow...|        5.0|
|Marvel Avengers P...|        5.0|
|Fresh Start Water...|        5.0|
|Borosilicate Glas...|        5.0|
+--------------------+-----------+
only showing top 10 rows



In [14]:
# Check the schema to find any date column
df.printSchema()


root
 |-- index: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- sub_category: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- sale_price: string (nullable = true)
 |-- market_price: string (nullable = true)
 |-- type: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- description: string (nullable = true)



In [18]:
from pyspark.sql.functions import col, ceil

# Create synthetic month based on the index or product order.
# Assume we divide the index into groups representing months or periods.
df_with_month = df.withColumn("month", ceil(col("index") / 10))  # Example: Index 1-10 => Month 1, 11-20 => Month 2

# Group by the synthetic 'month' and count the products in each 'month'.
monthly_trends = df_with_month.groupBy("month").count().orderBy("month")

# Show the result.
monthly_trends.show()


+-----+-----+
|month|count|
+-----+-----+
| NULL| 9677|
|    1|   10|
|    2|   10|
|    3|   10|
|    4|   10|
|    5|   10|
|    6|   10|
|    7|   10|
|    8|   10|
|    9|   10|
|   10|   10|
|   11|   10|
|   12|   10|
|   13|   10|
|   14|   10|
|   15|   10|
|   16|   10|
|   17|   10|
|   18|   10|
|   19|   10|
+-----+-----+
only showing top 20 rows



In [20]:
!pip install textblob


Collecting textblob


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Collecting nltk>=3.8 (from textblob)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk>=3.8->textblob)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting joblib (from nltk>=3.8->textblob)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting tqdm (from nltk>=3.8->textblob)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
   ---------------------------------------- 626.3/626.3 kB 5.8 MB/s eta 0:00:00
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 11.3 MB/s eta 0:00:00
Downloading click-8.1.7-py3-none-any.whl (97 kB)
Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
Downloadin

In [21]:
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\S560092\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\S560092\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [24]:
import pandas as pd

# Goal 5: Sentiment Analysis using Keyword-Based Method

# Sample data (replace with your actual DataFrame)
data = {'product': ['Product A', 'Product B', 'Product C'],
        'description': ['I love this product', 'This is terrible', 'It works as expected']}
df = pd.DataFrame(data)

# Define lists of positive and negative keywords
positive_words = ['love', 'great', 'excellent', 'good', 'amazing', 'fantastic', 'awesome']
negative_words = ['terrible', 'bad', 'horrible', 'awful', 'disappointing', 'poor', 'worse']

# Function to classify sentiment based on keywords
def get_sentiment(text):
    # Convert text to lower case to make the comparison case-insensitive
    text = text.lower()
    
    # Initialize sentiment score (0 for neutral, 1 for positive, -1 for negative)
    sentiment_score = 0
    
    # Check for positive and negative words in the text
    for word in positive_words:
        if word in text:
            sentiment_score += 1
    
    for word in negative_words:
        if word in text:
            sentiment_score -= 1
    
    # Classify sentiment
    if sentiment_score > 0:
        return 'Positive'
    elif sentiment_score < 0:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to the 'description' column
df['sentiment'] = df['description'].apply(get_sentiment)

# Show the results
print(df[['product', 'sentiment']])


     product sentiment
0  Product A  Positive
1  Product B  Negative
2  Product C   Neutral


In [26]:
import pandas as pd

# Sample data (replace with your actual DataFrame)
data = {'product': ['Product A', 'Product B', 'Product C', 'Product D'],
        'category': ['Electronics', 'Clothing', 'Electronics', 'Clothing']}
df = pd.DataFrame(data)

# Goal 6: Product Categories Popularity using Pandas

# Group by category and count the occurrences
category_popularity = df.groupby('category').size().reset_index(name='count')

# Sort by count in descending order and display top 10
category_popularity_sorted = category_popularity.sort_values(by='count', ascending=False).head(10)

# Show the result
print(category_popularity_sorted)


      category  count
0     Clothing      2
1  Electronics      2


In [29]:
from pyspark.sql.functions import length, col

# Goal 7: Word Count for Product Descriptions in PySpark

df_with_word_count = df_spark.withColumn('word_count', length(col('description')).alias('word_count'))

# Show the result
df_with_word_count.select('product', 'description', 'word_count').show(10)


NameError: name 'df_spark' is not defined

In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import length, col

# Start a Spark session (if not already started)
spark = SparkSession.builder.appName("Word Count").getOrCreate()

# Assuming df is a Pandas DataFrame, convert it to a PySpark DataFrame
df_spark = spark.createDataFrame(df)

# Now you can proceed with Goal 7
df_with_word_count = df_spark.withColumn('word_count', length(col('description')).alias('word_count'))

# Show the result
df_with_word_count.select('product', 'description', 'word_count').show(10)



AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `description` cannot be resolved. Did you mean one of the following? [`product`, `category`].;
'Project [product#275, category#276, length('description) AS word_count#280]
+- LogicalRDD [product#275, category#276], false


In [31]:
# Check the column names of the DataFrame
df_spark.columns


['product', 'category']