In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=55609ab173e1011e3d5d94eec88c7b95586e6dc3f05610d70b4fc26936ffe774
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import time


# Start timing
start_time = time.time()
# Create a SparkSession
spark = SparkSession.builder \
    .appName("PageTitlesAnalysis") \
    .getOrCreate()

# Read the .out file as RDD
page_views_rdd = spark.sparkContext.textFile("pagecounts-20160101-000000_parsed.out")

# Split each line by whitespace and extract the page size column
page_sizes_rdd = page_views_rdd.map(lambda line: int(line.split(" ")[3]))

# Compute min, max, and average using a loop
total_size = 0
count = 0
min_size = float('inf')
max_size = float('-inf')

for size in page_sizes_rdd.collect():
    total_size += size
    count += 1
    min_size = min(min_size, size)
    max_size = max(max_size, size)

average_size = total_size / count

print("Minimum Page Size:", min_size)
print("Maximum Page Size:", max_size)
print("Average Page Size:", average_size)

# Calculate elapsed time
end_time = time.time()
elapsed_time = end_time - start_time

print("Elapsed time:", elapsed_time)

Minimum Page Size: 0
Maximum Page Size: 141180155987
Average Page Size: 132239.56957446598
Elapsed time: 33.38075542449951


In [None]:
#query 2 spark loop
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import time

# Start timing
start_time = time.time()

spark = SparkSession.builder \
    .appName("PageTitlesAnalysis") \
    .getOrCreate()

# Define the schema based on the provided field meaning
schema = StructType([
    StructField("Project code", StringType(), True),
    StructField("Page title", StringType(), True),
    StructField("Page hits", StringType(), True),
    StructField("Page size", StringType(), True)
])

# Read the .out file as DataFrame using the defined schema
page_views_df = spark.read.option("delimiter", " ").schema(schema) \
    .csv("pagecounts-20160101-000000_parsed.out")

# Filter page titles that start with "The"
page_titles_starting_with_the = page_views_df.filter(col("Page title").startswith("The"))

# Count the total number of page titles starting with "The"
total_page_titles_starting_with_the = page_titles_starting_with_the.count()

# Count the number of page titles starting with "The" that are not part of the English project
non_english_page_titles_starting_with_the = page_titles_starting_with_the \
    .filter(~col("Project code").startswith("en")).count()

print("Total page titles starting with 'The':", total_page_titles_starting_with_the)
print("Page titles starting with 'The' not part of the English project:", non_english_page_titles_starting_with_the)

# Calculate elapsed time
end_time = time.time()
elapsed_time = end_time - start_time

print("Elapsed time:", elapsed_time)


Total page titles starting with 'The': 45020
Page titles starting with 'The' not part of the English project: 9128
Elapsed time: 16.7674777507782


In [6]:
#Spark loop query 3
import re
import time
from pyspark.sql import SparkSession

# Start timing
start_time = time.time()

# Create a Spark session
spark = SparkSession.builder \
    .appName("PageAnalysis") \
    .getOrCreate()

# Read the data
data = spark.read.text("pagecounts-20160101-000000_parsed.out")

# Define a function to extract page titles
def extract_page_title(line):
    title = line.value.split()[1].lower()
    # Remove non-alphanumeric characters
    title = re.sub(r'[^a-zA-Z0-9\s]', '_', title)
    return title

# Apply the function to the DataFrame
page_titles = data.rdd.map(extract_page_title)

# Initialize an empty dictionary to store counts
title_counts = {}

# Loop through each title and update the counts
for title in page_titles.collect():
    title_counts[title] = title_counts.get(title, 0) + 1

# Count the number of unique page titles
unique_titles_count = len(title_counts)
print("Number of unique page titles:", unique_titles_count)

# Stop the Spark session
spark.stop()

# Calculate elapsed time
end_time = time.time()
elapsed_time = end_time - start_time

print("Elapsed time:", elapsed_time)


Number of unique page titles: 2912031
Elapsed time: 43.58415627479553


In [None]:
from pyspark.sql import SparkSession
from collections import Counter
import re
import time

# Create a SparkSession
spark = SparkSession.builder \
    .appName("PageTitlesAnalysis") \
    .getOrCreate()

# Read the dataset file as RDD
page_views_rdd = spark.sparkContext.textFile("pagecounts-20160101-000000_parsed.out")

# Measure the start time
start_time = time.time()

# Extract page titles from each line
titles = page_views_rdd.map(lambda line: line.split(" ")[1].lower())

# Count occurrences of each title
title_counts = Counter(titles.collect())

# Sort titles by count in descending order
sorted_titles = sorted(title_counts.items(), key=lambda x: x[1], reverse=True)

# Print titles and counts in descending order
for title, count in sorted_titles:  # Print only the first 10,000 titles
    print("Title:", title, ", Count:", count)

# Measure the end time
end_time = time.time()

# Print the execution time
print("Execution Time:", end_time - start_time)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Title: template:prc_admin/data/37/16/21/115/235 , Count: 1
Title: template:prc_admin/data/37/16/22/103/225 , Count: 1
Title: template:prc_admin/data/37/16/22/103/238 , Count: 1
Title: template:prc_admin/data/37/16/22/104/271 , Count: 1
Title: template:prc_admin/data/37/16/23/107/400 , Count: 1
Title: template:prc_admin/data/37/16/25/106/223 , Count: 1
Title: template:prc_admin/data/37/16/26/005/220 , Count: 1
Title: template:prc_admin/data/37/16/26/101/203 , Count: 1
Title: template:prc_admin/data/37/17/02/006/010 , Count: 1
Title: template:prc_admin/data/37/17/21/102/205 , Count: 1
Title: template:prc_admin/data/37/17/21/110/226 , Count: 1
Title: template:prc_admin/data/37/17/22/105/227 , Count: 1
Title: template:prc_admin/data/37/17/22/116/233 , Count: 1
Title: template:prc_admin/data/37/17/24/106/222 , Count: 1
Title: template:prc_admin/data/37/17/24/111/214 , Count: 1
Title: template:prc_admin/data/37/17/25/104/241 , 

In [None]:
from pyspark.sql import SparkSession
import time

# Create a SparkSession
spark = SparkSession.builder \
    .appName("PageTitlesAnalysis") \
    .getOrCreate()

# Read the dataset file as RDD
page_views_rdd = spark.sparkContext.textFile("pagecounts-20160101-000000_parsed.out")

# Measure the start time
start_time = time.time()

# Initialize an empty dictionary to store combined data for each page title
page_data = {}

# Loop through each line, split by whitespace, and extract the page title and other data
for line in page_views_rdd.collect():
    parts = line.split(" ")
    title = parts[1]
    data = (parts[2], parts[3])  # Assuming parts[2] and parts[3] represent additional data for the page
    if title in page_data:
        page_data[title].append(data)
    else:
        page_data[title] = [data]

# Save the combined data for each page title
for title, data_list in page_data.items():
    # Save or process the data as needed (e.g., write to a file)
    print("Page Title:", title)
    for data in data_list:
        print("Data:", data)

# Measure the end time
end_time = time.time()

# Print the execution time
print("Execution Time:", end_time-start_time)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Data: ('1', '9646')
Page Title: User_talk:Luenda
Data: ('1', '10139')
Page Title: User_talk:Luesingwen
Data: ('1', '9563')
Page Title: User_talk:Lukepun
Data: ('1', '9461')
Page Title: User_talk:Lululouis
Data: ('1', '7218')
Page Title: User_talk:Lunanaut
Data: ('1', '9304')
Page Title: User_talk:Luozihan
Data: ('1', '9279')
Page Title: User_talk:Lussok
Data: ('1', '10178')
Page Title: User_talk:Lwjthuwow
Data: ('1', '9292')
Page Title: User_talk:Ly9727
Data: ('1', '9268')
Page Title: User_talk:Lyan_liu
Data: ('1', '9460')
Page Title: User_talk:Lyliylytl
Data: ('2', '76517')
Page Title: User_talk:Lylylulu23
Data: ('1', '9324')
Page Title: User_talk:Lys_raven
Data: ('1', '9299')
Page Title: User_talk:Lzhencheng
Data: ('1', '9305')
Page Title: User_talk:Lzmxya
Data: ('1', '9906')
Page Title: User_talk:M0idv
Data: ('1', '6782')
Page Title: User_talk:M24992492
Data: ('1', '6809')
Page Title: User_talk:M464443128
Data: ('1', '