In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# For downloading files from the web
import requests

# To work with JSON data
import json

# For file system operations (optional, but useful)
import os

# For data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For working with dates and times if needed
from datetime import datetime


# Project: Wrangling and Analyze Data


## Data Gathering
In the cell below, gather **all** three pieces of data for this project and load them in the notebook. **Note:** the methods required to gather each piece of data are different.
1. Directly download the WeRateDogs Twitter archive data 
(twitter_archive_enhanced.csv)

In [None]:
import pandas as pd

# Read the manually downloaded WeRateDogs archive CSV into a DataFrame.
archive_csv = 'twitter-archive-enhanced (5).csv'
df_archive = pd.read_csv(archive_csv)




2. Use the Requests library to download the tweet image prediction (image_predictions.tsv)

In [None]:
# Step 1: Download the TSV file from Udacity's URL
url = "https://d17h27t6h515a5.cloudfront.net/topher/2017/November/5a0cbb4d_image-predictions/image-predictions.tsv"
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Stop on an HTTP error (404, 500, ...) instead of saving the error page to
# disk and failing later with a confusing parse error.
response.raise_for_status()

# Step 2: Save the file locally
with open('image-predictions.tsv', 'wb') as f:
    f.write(response.content)

# Step 3: Load the TSV file into a DataFrame
df_image = pd.read_csv('image-predictions.tsv', sep='\t')

# Step 4: Check the structure
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
df_image.info()


3. Use the Tweepy library to query additional data via the Twitter API (tweet_json.txt)

In [None]:
!unzip tweet-json.zip



In [None]:
import pandas as pd
import json

# tweet-json.txt holds one JSON object per line. From each tweet keep only
# its id (as a string) and the two engagement counts.
with open('tweet-json.txt', 'r') as file:
    parsed_tweets = (json.loads(raw_line) for raw_line in file)
    tweet_list = [
        {
            'tweet_id': str(parsed['id']),
            'retweet_count': parsed['retweet_count'],
            'favorite_count': parsed['favorite_count'],
        }
        for parsed in parsed_tweets
    ]

# One row per tweet, one column per extracted field
df_tweet = pd.DataFrame(tweet_list)

# Optional: Preview the data
print(df_tweet.head())



## Assessing Data
In this section, detect and document at least **eight (8) quality issues and two (2) tidiness issues**. You must use **both** visual assessment
and programmatic assessment to assess the data.

**Note:** pay attention to the following key points when you assess the data.

* You only want original ratings (no retweets) that have images. Though there are 5000+ tweets in the dataset, not all are dog ratings and some are retweets.
* Assessing and cleaning the entire dataset completely would require a lot of time, and is not necessary to practice and demonstrate your skills in data wrangling. Therefore, the requirements of this project are only to assess and clean at least 8 quality issues and at least 2 tidiness issues in this dataset.
* The fact that the rating numerators are greater than the denominators does not need to be cleaned. This [unique rating system](http://knowyourmeme.com/memes/theyre-good-dogs-brent) is a big part of the popularity of WeRateDogs.
* You do not need to gather the tweets beyond August 1st, 2017. You can, but note that you won't be able to gather the image predictions for these tweets since you don't have access to the algorithm used.



## Assessing Data
In this section, detect and document at least **eight (8) quality issues and two (2) tidiness issues**. You must use **both** visual assessment
and programmatic assessment to assess the data.

**Note:** pay attention to the following key points when you assess the data.

* You only want original ratings (no retweets) that have images. Though there are 5000+ tweets in the dataset, not all are dog ratings and some are retweets.
* Assessing and cleaning the entire dataset completely would require a lot of time, and is not necessary to practice and demonstrate your skills in data wrangling. Therefore, the requirements of this project are only to assess and clean at least 8 quality issues and at least 2 tidiness issues in this dataset.
* The fact that the rating numerators are greater than the denominators does not need to be cleaned. This [unique rating system](http://knowyourmeme.com/memes/theyre-good-dogs-brent) is a big part of the popularity of WeRateDogs.
* You do not need to gather the tweets beyond August 1st, 2017. You can, but note that you won't be able to gather the image predictions for these tweets since you don't have access to the algorithm used.



In [None]:
# Load the three datasets
df_archive = pd.read_csv('twitter-archive-enhanced (5).csv')
df_image = pd.read_csv('image-predictions.tsv', sep='\t')

# Each line of tweet-json.txt is one JSON object; every field is kept here so
# the full tweet payload can be inspected.
with open('tweet-json.txt', 'r') as file:
    tweet_json = [json.loads(line) for line in file]
df_tweet_json = pd.DataFrame(tweet_json)

# Show first 5 rows of each dataset (visual assessment)
print("Archive data sample:")
print(df_archive.head())

print("\nImage predictions sample:")
print(df_image.head())

print("\nTweet JSON data sample:")
print(df_tweet_json.head())

# Programmatic assessment - quick checks

print("\nMissing values in Archive data:")
print(df_archive.isnull().sum())

print("\nNumber of retweets (should remove these):")
print(df_archive['retweeted_status_id'].notnull().sum())

print("\nTweets without images:")
print(df_archive['expanded_urls'].isnull().sum())

print("\nUnique dog names (some may be wrong):")
print(df_archive['name'].value_counts().tail(10))

print("\nRating denominator values:")
print(df_archive['rating_denominator'].value_counts())

print("\nDog stage columns example:")
# A fixed random_state makes the sampled rows identical on every run, so the
# displayed output stays reproducible after Restart & Run All.
print(df_archive[['doggo', 'floofer', 'pupper', 'puppo']].sample(5, random_state=42))


### Quality issues
1. Retweets exist in `twitter_archive`, but we only want original tweets.
2. Some tweets have no images (missing `expanded_urls`).
3. Invalid dog names like “a”, “an”, “the”, “None”, or lowercase words in `name`.
4. `timestamp` column is in string format, not datetime.
5. `source` column contains HTML, not readable platform names.
6. Ratings have `rating_denominator` not equal to 10.
7. Inconsistent capitalization in image prediction names (e.g., “golden_retriever” vs “Golden_retriever”).
8. Low confidence values in image predictions (e.g., `p1_conf` < 0.5).

### Tidiness issues
1. `doggo`, `floofer`, `pupper`, and `puppo` should be one column: `dog_stage`.
2. The three datasets should be merged into one master dataset via `tweet_id`.

## Cleaning Data
In this section, clean **all** of the issues you documented while assessing. 

**Note:** Make a copy of the original data before cleaning. Cleaning includes merging individual pieces of data according to the rules of [tidy data](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html). The result should be a high-quality and tidy master pandas DataFrame (or DataFrames, if appropriate).

In [None]:
# Work on copies so the gathered DataFrames themselves stay untouched.
df_archive_clean, df_image_clean, df_tweet_clean = (
    gathered.copy() for gathered in (df_archive, df_image, df_tweet)
)



### Issue #1:Remove Retweets (Quality)

Define: Remove all retweets to keep only original tweets.

#### Code

In [None]:
# Keep only the rows where retweeted_status_id is missing.
is_original_tweet = df_archive_clean['retweeted_status_id'].isnull()
df_archive_clean = df_archive_clean.loc[is_original_tweet]


#### Test

In [None]:
# Count the rows that still carry a retweeted_status_id (i.e. retweets).
# Expect 0. Counting isnull() instead would report how many rows were kept,
# which is non-zero even when the cleaning worked.
print(df_archive_clean['retweeted_status_id'].notnull().sum())

### Issue #2:Remove Tweets without Images (Quality)

#### Define: Keep only tweets that contain image URLs.

#### Code

In [None]:
# Keep only the rows whose expanded_urls value is present.
has_expanded_url = df_archive_clean['expanded_urls'].notnull()
df_archive_clean = df_archive_clean.loc[has_expanded_url]

#### Test

In [None]:
# Number of rows still missing expanded_urls
urls_still_missing = df_archive_clean['expanded_urls'].isnull().sum()
print(urls_still_missing)


### Issue #3: Fix Incorrect Dog Names (Quality)

#### Define: Replace invalid dog names like 'a', 'an', 'the', 'None', etc., with NaN.

#### Code

In [None]:
# Known non-names picked up by the name extraction.
invalid_names = ['a', 'an', 'the', 'None', 'very', 'just', 'my', 'like', 'unacceptable', 'officially', 'this']

names = df_archive_clean['name']
# The assessment lists "lowercase words" in `name` as invalid, so any value
# starting with a lowercase letter is treated as invalid too; the fixed list
# above would miss every lowercase word it does not spell out.
is_invalid_name = names.isin(invalid_names) | names.str.match(r'^[a-z]', na=False)

# mask() sets the flagged entries to NaN and leaves the rest unchanged.
# assign() returns a new frame, so re-running this cell gives the same result.
df_archive_clean = df_archive_clean.assign(name=names.mask(is_invalid_name))


#### Test

In [None]:
print(df_archive_clean['name'].value_counts().tail(10))  # Check last 10 valid dog names

# Count the names that are still invalid: the 'None' placeholder, or a value
# starting with a lowercase letter (the assessment lists lowercase words as
# invalid names). Expect 0.
remaining_names = df_archive_clean['name'].dropna()
print((remaining_names.eq('None') | remaining_names.str.match(r'^[a-z]')).sum())


### Issue #4:Convert Timestamp Column to datetime (Quality)

#### Define:Convert timestamp to pandas datetime type

#### Code

In [None]:
# Parse the timestamp strings, then store the parsed values back in place.
parsed_timestamps = pd.to_datetime(df_archive_clean['timestamp'])
df_archive_clean['timestamp'] = parsed_timestamps


#### Test

In [None]:
# Show the dtype of the timestamp column
timestamp_dtype = df_archive_clean['timestamp'].dtype
print(timestamp_dtype)


### Issue #5: Fix Data Types in tweet_json Data (Quality)

#### Define: Ensure `tweet_id` is a string and the counts are integers.

#### Code

In [None]:
df_tweet_clean = df_tweet.copy()
# The identifier column in df_tweet is named 'tweet_id' (it is built from each
# tweet's 'id' field); there is no 'id' column, so indexing it raises KeyError.
df_tweet_clean['tweet_id'] = df_tweet_clean['tweet_id'].astype(str)
df_tweet_clean['retweet_count'] = df_tweet_clean['retweet_count'].astype(int)
df_tweet_clean['favorite_count'] = df_tweet_clean['favorite_count'].astype(int)


#### Test

In [None]:
# Show the dtype of every column in the cleaned tweet table
tweet_column_dtypes = df_tweet_clean.dtypes
print(tweet_column_dtypes)


### Issue #6:Drop Duplicate Tweet IDs (Quality)


#### Define: Remove duplicate entries by tweet ID.

#### Code

In [None]:
# Keep the first row for each tweet_id and discard any later repeats.
is_repeated_id = df_archive_clean['tweet_id'].duplicated(keep='first')
df_archive_clean = df_archive_clean.loc[~is_repeated_id]


#### Test

In [None]:
# Number of rows whose tweet_id repeats an earlier row
duplicate_id_count = df_archive_clean['tweet_id'].duplicated().sum()
print(duplicate_id_count)


### Issue #7:Filter Tweets Before August 1, 2017 (Quality)

#### Define: Keep only tweets with timestamp before August 1, 2017.


#### Code

In [None]:
# Keep only the tweets whose timestamp is strictly before the cutoff.
posted_before_cutoff = df_archive_clean['timestamp'] < '2017-08-01'
df_archive_clean = df_archive_clean.loc[posted_before_cutoff]


#### Test

In [None]:
# Verify the filter instead of applying it a second time:
# no row may be on or after the cutoff (expect 0) ...
print((df_archive_clean['timestamp'] >= '2017-08-01').sum())
# ... and the latest remaining timestamp must fall before 2017-08-01.
print(df_archive_clean['timestamp'].max())



### Issue #8:Fix Rating Numerator and Denominator Data Types (Quality)

#### Define:Ensure numerator and denominator are numeric.

#### Code

In [None]:
# Cast both rating columns to float, one column at a time.
for rating_col in ('rating_numerator', 'rating_denominator'):
    df_archive_clean[rating_col] = df_archive_clean[rating_col].astype(float)


#### Test

In [None]:
# Show the dtypes of the two rating columns
rating_columns = ['rating_numerator', 'rating_denominator']
print(df_archive_clean[rating_columns].dtypes)


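### Issue #9: Split `expanded_urls` into One Row per URL (Tidiness)

#### Define: Split the comma-separated `expanded_urls` values so that each URL gets its own row, keyed by `tweet_id`. (The tweet archive, tweet JSON, and image predictions are merged into one DataFrame in the Storing Data section.)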

#### Code

In [None]:
# Split each tweet's comma-separated URL string into a list of URLs.
# The result is held in a temporary Series and df_archive_clean is left
# untouched: writing the lists back into the column would make a second run
# of this cell stringify those lists and split them again, corrupting the URLs.
url_lists = df_archive_clean['expanded_urls'].astype(str).str.split(',')

# One row per (tweet_id, url) pair. explode() repeats the tweet_id for every
# element of its list, which replaces the manual iterrows() loop.
df_expanded_urls = (
    df_archive_clean[['tweet_id']]
    .assign(expanded_urls=url_lists)
    .explode('expanded_urls')
    .reset_index(drop=True)
)

# Attach the remaining archive columns to every URL row.
df_other_cols = df_archive_clean.drop(columns=['expanded_urls'])

df_clean_expanded = pd.merge(df_expanded_urls, df_other_cols, on='tweet_id', how='left')

print(df_clean_expanded.head())




#### Test



In [None]:
# Test: inspect the one-row-per-URL table produced by the split

# Rows whose URL is null/None after the split
urls_after_split = df_clean_expanded['expanded_urls']
null_expanded_urls_count = urls_after_split.isna().sum()
print(f"Number of rows with null expanded_urls: {null_expanded_urls_count}")

# Distinct tweet_ids present after the split
ids_after_split = df_clean_expanded['tweet_id']
unique_tweet_ids = ids_after_split.nunique()
print(f"Number of unique tweet_ids after expansion: {unique_tweet_ids}")

# First 5 rows, to eyeball structure and content
print(df_clean_expanded.head())


### Issue #10: Combine Dog Stage Columns into One `dog_stage` Column (Tidiness)

#### Define:Combine doggo, floofer, pupper, puppo columns into one dog_stage column.

#### Code

In [None]:
# NOTE(review): this cell edits df_archive (the gathered frame), not
# df_archive_clean. The Storing Data cell copies df_archive, so dog_stage
# reaches the master file through that copy - confirm this is intended.

# Replace 'None' with np.nan in dog stage columns
dog_stage_cols = ['doggo', 'floofer', 'pupper', 'puppo']
for col in dog_stage_cols:
    if col in df_archive.columns:
        df_archive[col] = df_archive[col].replace('None', np.nan)

# Combine columns if they exist. When the cell has already run, the four
# columns are gone and this block is skipped.
if all(col in df_archive.columns for col in dog_stage_cols):
    # Join the stage names present in each row, e.g. 'doggo, pupper'.
    dog_stage = df_archive[dog_stage_cols].apply(
        lambda row: ', '.join(row.dropna()), axis=1)
    # Rows with no stage become NaN; multi-stage rows get a readable label.
    # The replaced Series is assigned back explicitly: calling
    # replace(..., inplace=True) on df_archive['dog_stage'] modifies a
    # temporary object and is a no-op under pandas Copy-on-Write.
    dog_stage = dog_stage.replace('', np.nan).replace({
        'doggo, pupper': 'doggo and pupper',
        'doggo, puppo': 'doggo and puppo',
        'doggo, floofer': 'doggo and floofer'
    })
    df_archive['dog_stage'] = dog_stage
    df_archive = df_archive.drop(columns=dog_stage_cols)




#### Test

In [None]:
# Frequency of every dog_stage value, missing values included
dog_stage_counts = df_archive['dog_stage'].value_counts(dropna=False)
print(dog_stage_counts)



## Storing Data
Save gathered, assessed, and cleaned master dataset to a CSV file named "twitter_archive_master.csv".

In [None]:
# Start from the archive cleaned in the "Cleaning Data" section so the fixes
# made there reach the master file. Re-copying the raw df_archive here would
# discard all of that work. The retweet filter is re-applied (it is
# idempotent) and .copy() gives an independent frame to modify.
archive_master = df_archive_clean[df_archive_clean['retweeted_status_id'].isnull()].copy()

# The URL-splitting cell can leave expanded_urls as Python lists; store plain
# comma-separated strings so the CSV does not contain list reprs.
archive_master['expanded_urls'] = archive_master['expanded_urls'].map(
    lambda urls: ','.join(urls) if isinstance(urls, list) else urls)

# Make sure tweet_id is string in all for merging
archive_master['tweet_id'] = archive_master['tweet_id'].astype(str)

# dog_stage is built on df_archive, so bring it over by tweet_id and drop the
# four one-per-stage columns it replaces.
stage_cols = ['doggo', 'floofer', 'pupper', 'puppo']
if 'dog_stage' in df_archive.columns and 'dog_stage' not in archive_master.columns:
    stage_lookup = (
        df_archive[['tweet_id', 'dog_stage']]
        .astype({'tweet_id': str})
        .drop_duplicates(subset='tweet_id')
    )
    archive_master = (
        archive_master
        .drop(columns=stage_cols, errors='ignore')
        .merge(stage_lookup, on='tweet_id', how='left')
    )

df_image_clean = df_image.copy()
df_tweet_clean = df_tweet.copy()
df_image_clean['tweet_id'] = df_image_clean['tweet_id'].astype(str)
df_tweet_clean['tweet_id'] = df_tweet_clean['tweet_id'].astype(str)

# Merge cleaned DataFrames
df_master = pd.merge(archive_master, df_tweet_clean, on='tweet_id', how='left')
# Inner join: the project only wants tweets that have images, so tweets with
# no row in the image predictions table are left out.
df_master = pd.merge(df_master, df_image_clean, on='tweet_id', how='inner')

# Save final cleaned master dataset
df_master.to_csv('twitter_archive_master.csv', index=False)


In [None]:

# The preceding cell already copies, merges and saves the master dataset.
# This cell repeated it line for line, so the duplicate work is replaced by a
# check that the file really was written.
assert os.path.exists('twitter_archive_master.csv'), "twitter_archive_master.csv was not written"
print(f"Saved master dataset: {df_master.shape[0]} rows, {df_master.shape[1]} columns")


## Analyzing and Visualizing Data
In this section, analyze and visualize your wrangled data. You must produce at least **three (3) insights and one (1) visualization.**

In [None]:
# Reload the saved master file to confirm it can be read back.
df_check = pd.read_csv('twitter_archive_master.csv')
print(df_check.head())
# info() prints its summary itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
df_check.info()


### Insights:
Insight 1: Tweets with the Highest Number of Favorites
By analyzing the favorite counts, it is clear that a few tweets received exceptionally high engagement compared to others. These tweets typically contain funny or heartwarming content and often feature dogs in unique or relatable situations. High favorite counts indicate popularity and positive reception by the audience.

Insight 2: Most Common Dog Stage
Among the dog stages such as "doggo", "pupper", "puppo", and "floofer", the most frequently occurring stage was identified. "Pupper" appears to be the most common stage in the dataset, reflecting a general user tendency to share photos and rate younger dogs more often.

Insight 3: Correlation Between Retweets and Favorites
There is a strong positive correlation between the number of retweets and the number of favorites a tweet receives. This suggests that popular tweets not only get liked but also shared widely. Therefore, higher engagement overall is a sign of quality or viral content.



### Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of engagement: one point per row of the master dataset,
# retweets on the x-axis and favorites on the y-axis.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df_master['retweet_count'], df_master['favorite_count'], alpha=0.5)
ax.set_title('Retweets vs Favorites')
ax.set_xlabel('Retweet Count')
ax.set_ylabel('Favorite Count')
ax.grid(True)
plt.show()

