In [2]:
# Packages Installation
import pandas as pd
from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration, T5Tokenizer
from datasets import load_metric
from sklearn.model_selection import train_test_split
import re

  from .autonotebook import tqdm as notebook_tqdm


##### Fetching Metadata from CSV

In [3]:
Metadata = pd.read_csv("E:\\Data Science\\Research Project\\Research_Project\\Rough Scripts\\MetaData_Cleaned_DS.csv")

##### Preparing the data

In [4]:
def preprocess_metadata(metadata):
    # Clean text data (e.g., remove special characters, lowercase)
    cleaned_metadata = metadata.copy() 
    cleaned_metadata['description'] = cleaned_metadata['description'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', str(x)))
    cleaned_metadata['description'] = cleaned_metadata['description'].str.lower()
    cleaned_metadata['title'] = cleaned_metadata['title'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', str(x)))
    cleaned_metadata['title'] = cleaned_metadata['title'].str.lower()
    cleaned_metadata['summary'] = cleaned_metadata['summary'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', str(x)))
    cleaned_metadata['summary'] = cleaned_metadata['summary'].str.lower()

    
    # Tokenize text data (split into words)
    cleaned_metadata['description'] = cleaned_metadata['description'].str.split()        
    cleaned_metadata['title'] = cleaned_metadata['title'].str.split()        
    cleaned_metadata['summary'] = cleaned_metadata['summary'].str.split()

    # Handle missing values (e.g., fill with a placeholder or remove rows)
    cleaned_metadata.fillna('', inplace=True)
    
    return cleaned_metadata

# Preprocess metadata
Preprocess_Metadata = preprocess_metadata(Metadata)

print("Preprocessed metadata:")
print(Preprocess_Metadata.head())


Preprocessed metadata:
            owner                      id  \
0     data-ny-gov               thd2-fu8y   
1      markmarkoh        coronavirus-data   
2    chandrasekar          coronatracking   
3  george-t-stagg       cost-benchmarking   
4      crawlfeeds  fashion-images-dataset   

                                               title  \
0  [nyserda, electric, vehicle, drive, clean, reb...   
1                         [coronavirus, daily, data]   
2                                   [coronatracking]   
3                               [cost, benchmarking]   
4                          [fashion, image, dataset]   

                                         description  \
0  [new, york, state, charge, ny, initiative, off...   
1      [coronoavirus, covid19, data, updated, daily]   
2  [adding, public, datasets, related, to, corona...   
3  [cost, benchmarking, data, for, the, beverage,...   
4                 [fashion, product, image, dataset]   

                                

In [5]:
Preprocess_Metadata = preprocess_metadata(Metadata)


In [6]:
import ast  # Module for literal string evaluation

Preprocess_Metadata['files'] = Preprocess_Metadata['files'].apply(ast.literal_eval)


In [7]:
# Function to extract file formats
def extract_formats(files):
    formats = set()
    for file in files:
        filename = file['name']
        file_format = filename.split('.')[-1]
        formats.add(file_format)
    return list(formats)

    # Apply function to create new column
Preprocess_Metadata['available_formats'] = Preprocess_Metadata['files'].apply(extract_formats)


In [8]:
Preprocess_Metadata

Unnamed: 0,owner,id,title,description,summary,version,tags,license,visibility,files,...,created,updated,accessLevel,versionDois,isProject,assetStatus,properties,dois,dataset_url,available_formats
0,data-ny-gov,thd2-fu8y,"[nyserda, electric, vehicle, drive, clean, reb...","[new, york, state, charge, ny, initiative, off...","[original, title, nyserda, electric, vehicle, ...",dab0adef-3132-420e-96da-46d5ed67fd12,ev electric vehicle bev phev ghg drive clean r...,other,open,[{'name': 'nyserda-electric-vehicle-drive-clea...,...,2020-05-07T10:40:12,2020-10-08T07:21:12,read,[],False,,,,https://data.world/data-ny-gov/thd2-fu8y,[csv]
1,markmarkoh,coronavirus-data,"[coronavirus, daily, data]","[coronoavirus, covid19, data, updated, daily]","[editormarkdown, originally, sourced, from, sy...",30a6f17d-2090-48c3-8150-dcc07579d781,coronavirus health virus flu,,open,"[{'name': 'full_data.csv', 'sizeInBytes': 1654...",...,2020-03-12T02:55:16,2020-12-04T15:15:28,read,[],False,,,,https://data.world/markmarkoh/coronavirus-data,[csv]
2,chandrasekar,coronatracking,[coronatracking],"[adding, public, datasets, related, to, corona...","[editorsimple, all, data, obtained, from, the,...",55a57c94-3c0c-46fc-b05f-f2222620f767,corona virus covid19 covid 19,,open,"[{'name': 'COVID-19 Cases_tableau.csv', 'sizeI...",...,2020-03-17T07:59:42,2020-03-19T08:14:11,read,[],True,,,,https://data.world/chandrasekar/coronatracking,[csv]
3,george-t-stagg,cost-benchmarking,"[cost, benchmarking]","[cost, benchmarking, data, for, the, beverage,...",[nan],4adbb32d-8c55-4fbf-8f70-fcb960b8f0b4,,,discoverable,[{'name': 'Situation Recort Army File (SITRA);...,...,2020-06-11T02:05:42,2021-05-14T05:29:05,discover,[],False,"{'assetStatusLabel': 'Approved', 'dispositionL...",{'Impact Potential': 'Impact Testo Michelada'},[],https://data.world/george-t-stagg/cost-benchma...,"[csv, xlsx]"
4,crawlfeeds,fashion-images-dataset,"[fashion, image, dataset]","[fashion, product, image, dataset]","[editorsimple, fashion, product, image, datase...",83f4476b-6a14-494d-823e-9a94b524c064,imagesdataset fashion image dataset image dataset,public domain,open,"[{'name': 'fashion_product_images.zip', 'sizeI...",...,2021-09-19T06:58:25,2022-08-09T20:08:10,read,[],False,,,[],https://data.world/crawlfeeds/fashion-images-d...,[zip]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,sports,recognized-sports,"[recognized, sport]",[nan],"[here, is, a, list, of, international, sport, ...",bbe67446-52cf-4668-aa13-4cf2650f8eb4,sport official sport list of sport type of sport,,open,"[{'name': 'recognized_sports.xlsx', 'sizeInByt...",...,2017-04-14T19:16:03,2017-12-20T19:28:57,read,[],False,,,,https://data.world/sports/recognized-sports,[xlsx]
65,crowdflower,sports-illustrated-covers,"[sport, illustrated, cover]",[nan],"[a, data, set, listing, the, sport, that, have...",756db1e6-2b6e-4827-869d-907036b57ebe,crowdsourced sport magazine cover image photo ...,public domain,open,"[{'name': 'SI-Cover-by-Sport-DFE.csv', 'sizeIn...",...,2016-11-21T23:43:16,2016-11-21T23:51:17,read,[],False,,,,https://data.world/crowdflower/sports-illustra...,[csv]
66,the-pudding,winningest-cities-in-sports,"[winningest, city, in, sport]","[data, for, every, championship, win, in, six,...","[editormarkdown, about, this, dataset, contain...",e6a36707-1abc-41e8-9ff4-6f9ba76b67d1,sport championship ranking basketball football...,other,open,"[{'name': 'case1.json', 'sizeInBytes': 126097,...",...,2019-03-10T19:54:14,2021-07-15T01:42:48,read,[],False,,,[],https://data.world/the-pudding/winningest-citi...,"[json, csv]"
67,datagov-uk,17c44e3a-804c-487b-a07f-b90298685e2a,"[thurrock, outdoor, sport]","[thurrock, outdoor, sport]","[this, dataset, show, the, location, of, outdo...",21708327-0e5c-4725-8bba-e4f616c8454c,,ogl,open,"[{'name': 'tc-outdoor-sports-1.xml', 'sizeInBy...",...,2021-08-26T08:50:00,2021-08-26T08:50:06,read,[],False,,,,https://data.world/datagov-uk/17c44e3a-804c-48...,[xml]


In [9]:
dataset = Preprocess_Metadata.copy()

In [10]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    if isinstance(text, list):
        # Convert list of strings to a single string
        text = ' '.join(text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove punctuation and non-alphabetic characters, and single character tokens
    tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens if token.isalpha() and len(token) > 1]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

dataset = Preprocess_Metadata.copy()

# Apply preprocessing to text columns
for column in ['title', 'description', 'summary']:
    dataset[column] = dataset[column].apply(preprocess_text)

# Prepare the User Input
user_input = "Corona Virus Pandemic"
preprocessed_user_input = preprocess_text(user_input)

# Generate Relevance Scores
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(dataset['title'] + ' ' + dataset['description'] + ' ' + dataset['summary'])
user_input_vector = tfidf_vectorizer.transform([preprocessed_user_input])
cosine_similarities = cosine_similarity(user_input_vector, tfidf_matrix)

# Get indices of top 5 relevant dataset URLs
top_indices = cosine_similarities.argsort()[0][-5:][::-1]

# Get top 5 relevant dataset URLs and their similarity scores
top_dataset_urls = dataset.iloc[top_indices]['dataset_url']
top_similarity_scores = cosine_similarities[0][top_indices]

print("Top 5 relevant dataset URLs with similarity scores:")
for url, score in zip(top_dataset_urls, top_similarity_scores):
    print(f"URL: {url}, Similarity Score: {score}")

Top 5 relevant dataset URLs with similarity scores:
URL: https://data.world/chandrasekar/coronatracking, Similarity Score: 0.2134654546451647
URL: https://data.world/rtekumalla1/a-twitter-dataset-of-100-million-tweets-related-to-covid-19, Similarity Score: 0.0693991457338
URL: https://data.world/kongkei/buy-wireless-security-cameras-at-affordable-prices, Similarity Score: 0.047899412887408045
URL: https://data.world/zendoll27/trends-in-youth-sports, Similarity Score: 0.0
URL: https://data.world/healthdatany/h55x-hu6n, Similarity Score: 0.0


In [12]:
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    dataset = Preprocess_Metadata.copy()

    # Apply preprocessing to text columns
    for column in ['title', 'description', 'summary']:
        dataset[column] = dataset[column].apply(preprocess_text)

    # Prepare the User Input
    user_input = "Expenditure"
    preprocessed_user_input = preprocess_text(user_input)

    # Generate Relevance Scores
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(dataset['title'] + ' ' + dataset['description'] + ' ' + dataset['summary'])
    user_input_vector = tfidf_vectorizer.transform([preprocessed_user_input])
    cosine_similarities = cosine_similarity(user_input_vector, tfidf_matrix)

    # Define similarity threshold
    similarity_threshold = 0.06

    # Get indices of relevant dataset URLs above the threshold
    relevant_indices = [i for i, score in enumerate(cosine_similarities[0]) if score > similarity_threshold]

    # Get relevant dataset URLs and their similarity scores
    relevant_dataset_urls = dataset.iloc[relevant_indices]['dataset_url']
    relevant_similarity_scores = cosine_similarities[0][relevant_indices]

    print("Relevant dataset URLs above similarity threshold:")
    for url, score in zip(relevant_dataset_urls, relevant_similarity_scores):
        print(f"URL: {url}, Similarity Score: {score}")


Relevant dataset URLs above similarity threshold:
URL: https://data.world/datagov-uk/3768fd01-2fa1-4514-82d4-ced4aad3e3e5, Similarity Score: 0.4380564269264124


In [13]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text_ignoring_common_words(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove punctuation, non-alphabetic characters, and single character tokens
    tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens if token.isalpha() and len(token) > 1]
    
    # Remove stopwords and common words
    stop_words = set(stopwords.words('english'))
    common_words = set(['datasets', 'data','repo','repositories','sources','data sources','database'])  # Define common words to remove
    tokens = [token for token in tokens if token not in stop_words and token not in common_words]
    
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Join tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text


In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

dataset = Preprocess_Metadata.copy()

# Apply preprocessing to text columns
for column in ['title', 'description', 'summary']:
    dataset[column] = dataset[column].apply(preprocess_text)

# Prepare the User Input
user_input = "Electric vehicles"
preprocessed_user_input = preprocess_text_ignoring_common_words(user_input)

# Generate Relevance Scores
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(dataset['title'] + ' ' + dataset['description'] + ' ' + dataset['summary'])
user_input_vector = tfidf_vectorizer.transform([preprocessed_user_input])
cosine_similarities = cosine_similarity(user_input_vector, tfidf_matrix)

# Define similarity threshold
similarity_threshold = 0.0

# Get indices of relevant dataset URLs above the threshold
relevant_indices = [i for i, score in enumerate(cosine_similarities[0]) if score > similarity_threshold]

# Get relevant dataset URLs and their similarity scores
relevant_dataset_urls = dataset.iloc[relevant_indices]['dataset_url']
relevant_similarity_scores = cosine_similarities[0][relevant_indices]

# Calculate min and max similarity scores
min_score = min(relevant_similarity_scores)
max_score = max(relevant_similarity_scores)

# Determine number of bands
num_outputs = len(relevant_dataset_urls)
num_bands = min(num_outputs, 3) 

# Divide similarity score range into bands
score_bands = [min_score + i * (max_score - min_score) / num_bands for i in range(num_bands)] + [max_score]

# Sort relevant dataset URLs and similarity scores by similarity score in descending order
sorted_indices = relevant_similarity_scores.argsort()[::-1]
sorted_relevant_dataset_urls = relevant_dataset_urls.iloc[sorted_indices]
sorted_relevant_similarity_scores = relevant_similarity_scores[sorted_indices]

print("Relevant dataset URLs above similarity threshold (arranged in bands based on similarity score):")
for i in reversed(range(num_bands)):
    band_start = score_bands[i]
    band_end = score_bands[i + 1]
    print(f"\nBand {i+1}: Similarity Score Range: {band_start:.2f} - {band_end:.2f}\n")
    for url, score in zip(sorted_relevant_dataset_urls, sorted_relevant_similarity_scores):
        if band_start <= score <= band_end:
            index = dataset[dataset['dataset_url'] == url].index[0]  # Find index of dataset
            available_formats = dataset.at[index, 'available_formats']  # Get available formats
            print(f"URL: {url}, Similarity Score: {score}, Available Formats: {available_formats}")



Relevant dataset URLs above similarity threshold (arranged in bands based on similarity score):

Band 3: Similarity Score Range: 0.27 - 0.34

URL: https://data.world/nhtsa/cars-purchased-vehicles-xls, Similarity Score: 0.3445352265372441, Available Formats: ['zip', 'xls']
URL: https://data.world/data-ny-gov/thd2-fu8y, Similarity Score: 0.3312246655689556, Available Formats: ['csv']

Band 2: Similarity Score Range: 0.19 - 0.27


Band 1: Similarity Score Range: 0.12 - 0.19

URL: https://data.world/us-epa-gov/124206c8-26a9-435b-9f6c-7cb0d830fbc2, Similarity Score: 0.1165826517390954, Available Formats: ['xml']
