<a href="https://colab.research.google.com/github/SANJAI-834/Comprehensive-Security-Framework-On-Twitter-Data/blob/main/Comprehensive_Security_Framework_For_Twitter_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Function to read words from a file and return unique words
def read_unique_words(filename):
    """Collect the distinct whitespace-separated tokens found in a text file.

    The file is decoded as UTF-8 and undecodable bytes are dropped.  Problems
    opening or reading the file are reported on stdout rather than raised, so
    the caller always gets a set back (possibly empty or partial).

    Args:
        filename: Path of the text file to scan.

    Returns:
        A set with every distinct token read before any error occurred.
    """
    collected = set()
    try:
        with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
            for raw_line in handle:
                # split() without arguments already discards surrounding whitespace.
                collected.update(raw_line.strip().split())
    except FileNotFoundError:
        print(f"File '{filename}' not found.")
    except Exception as exc:
        print(f"Error reading '{filename}': {str(exc)}")
    return collected

# Function to save unique words to a file
def save_unique_words(unique_words, filename):
    """Write the given words to a UTF-8 text file, one per line, in sorted order.

    A confirmation is printed on success; an I/O failure is reported on stdout
    instead of being raised.

    Args:
        unique_words: Iterable of strings to persist.
        filename: Destination path; an existing file is overwritten.
    """
    try:
        with open(filename, 'w', encoding='utf-8') as sink:
            sink.writelines(entry + '\n' for entry in sorted(unique_words))
    except IOError:
        print(f"Error: Could not write to file '{filename}'.")
    else:
        print(f"Unique words saved to '{filename}'.")

# Main function to read 5 files and display unique words
def main():
    """Prompt for five file paths, merge their unique words, print and save them.

    All five paths are collected before any file is read.  The merged
    vocabulary is printed in sorted order and then written to
    'unique_words.txt' in the current working directory.
    """
    print("Enter the paths of 5 files:")
    files = [input(f"File {index + 1}: ") for index in range(5)]

    # Merge the per-file vocabularies into one set.
    all_unique_words = set().union(*(read_unique_words(path) for path in files))

    print("\nUnique words across all files:")
    for word in sorted(all_unique_words):
        print(word)

    # Persist the merged vocabulary.
    save_unique_words(all_unique_words, 'unique_words.txt')

# Start the interactive prompt only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
purpose
purpose'
purpose,
pursue
pursuing?
pursuit
push
push,
push-ups
push?
pushed
pushes
pushing
put
puts
putting
puzzle
puzzle:
puzzled
puzzles
puzzles,
pyramid
p…
q
q's
q's!
q's:
quackery?
quadruple
quadruples
quadruplets
quake
quake.
quality
quality&gt;quantity
quality,
quality.
quality:
quantities
quantity
quantity,
quantity.
quarantine
quarantine,
quarantine.
quarantined
quarantines
quarantines,
quarantining
quarterbacks'
quasi-vegetarian
queasy:
queried
quest
question
question!
question,
question...
question.Remains
questioned
questioning
questionnaire
questions
questions!
questions'
questions,
questions.
questions/comments
questions:
quest…
queue
queues
quick
quick-deploy
quicker
quickly
quickly,
quiet
quietly
quinoa,
quirky
quit
quit'
quit,
quit--period."
quit:
quite
quits
quitting
quitting'
quitting,
quiz
quiz:
quizzed
quoted
quotes
q’s
r
rabbi
rabies
race
race!
racer
races
racey
racial
racing!
racism
racism?
r

In [None]:
def count_words(filename):
    """Return the number of whitespace-separated words in a text file.

    The file is decoded explicitly as UTF-8, matching how the word lists in
    this notebook are written, so the count does not depend on the platform's
    default encoding.  Lines are streamed rather than loaded all at once.

    Args:
        filename: Path of the text file to count.

    Returns:
        Total number of whitespace-delimited tokens in the file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        # Newlines are whitespace, so per-line counts add up to the same
        # total as splitting the whole text at once.
        return sum(len(line.split()) for line in file)

# Count the whitespace-separated words in the file below and print the total.
filename = '/content/sample_data/unique_words.txt'  # Replace with your file name
word_count = count_words(filename)
print(f"Word count: {word_count}")

Word count: 77225


In [None]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from joblib import Parallel, delayed

# Function to read the preprocessed text file
def read_preprocessed_file(file_path):
    """Load a UTF-8 text file as a list of lines.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        return list(source)

# Function to perform K-means clustering and determine optimal number of clusters
def kmeans_cluster_auto(data):
    """Cluster documents with MiniBatchKMeans, choosing k by silhouette score.

    The documents are vectorised with TF-IDF (English stop words removed).
    Every k from 2 up to min(10, n_samples - 1) is fitted in parallel and the
    k with the highest average silhouette score wins; ties go to the
    smallest k.

    Args:
        data: Sequence of text documents, one string per sample.

    Returns:
        A tuple (labels, centroids, feature_names, best_k): the per-sample
        cluster labels and cluster centres of the winning model, the TF-IDF
        vocabulary, and the chosen number of clusters.

    Raises:
        ValueError: If fewer than 3 documents are supplied, because the
            silhouette score needs between 2 and n_samples - 1 labels.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    # CSC conversion kept from the original code.
    # NOTE(review): its stated purpose was "ensure data is writable" - confirm it is still needed.
    X = vectorizer.fit_transform(data).tocsc()

    num_samples = X.shape[0]
    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1,
    # so k must stay strictly below the number of samples.
    max_clusters = min(10, num_samples - 1)
    if max_clusters < 2:
        raise ValueError(
            f"At least 3 documents are required to select a cluster count; got {num_samples}."
        )

    def cluster_and_score(num_clusters):
        # Fit one candidate model and score how well separated its clusters are.
        kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42)
        kmeans.fit(X)
        clusters = kmeans.labels_
        silhouette_avg = silhouette_score(X, clusters)
        return silhouette_avg, clusters, kmeans.cluster_centers_

    # Adjust the range as needed
    cluster_range = range(2, max_clusters + 1)
    results = Parallel(n_jobs=-1)(delayed(cluster_and_score)(num_clusters) for num_clusters in cluster_range)

    # Pick the winner by position so no tuple holding numpy arrays has to be
    # compared for equality; max() returns the first maximum, i.e. the
    # smallest k on ties.
    best_index = max(range(len(results)), key=lambda i: results[i][0])
    _, best_clusters, best_centroids = results[best_index]
    best_num_clusters = cluster_range[best_index]

    feature_names = vectorizer.get_feature_names_out()
    return best_clusters, best_centroids, feature_names, best_num_clusters

# Function to display clustered output
def display_clusters(data, clusters, feature_names):
    """Print every cluster with the documents assigned to it.

    Clusters appear in the order their ids are first seen in `clusters` and
    are numbered from 1 (id + 1).  Each document is stripped of surrounding
    whitespace before printing.

    Args:
        data: Sequence of documents, indexable by sample position.
        clusters: Per-sample cluster ids, aligned with `data`.
        feature_names: Accepted for interface compatibility; not used here.
    """
    groups = {}
    for position, label in enumerate(clusters):
        groups.setdefault(label, []).append(data[position].strip())

    for label, members in groups.items():
        print(f"Cluster {label + 1}:", "\n".join(members), "-----------------------", sep="\n")

# Function to save clustered output to a text file
def save_clusters_to_file(output_file, data, clusters):
    """Write every cluster and its documents to a UTF-8 text file.

    The layout mirrors the console display: a "Cluster N:" header (id + 1),
    the stripped documents one per line, then a dashed separator.  Clusters
    are written in the order their ids are first seen in `clusters`.

    Args:
        output_file: Destination path; an existing file is overwritten.
        data: Sequence of documents, indexable by sample position.
        clusters: Per-sample cluster ids, aligned with `data`.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        groups = {}
        for position, label in enumerate(clusters):
            groups.setdefault(label, []).append(data[position].strip())

        for label, members in groups.items():
            sink.write(
                f"Cluster {label + 1}:\n"
                + "\n".join(members)
                + "\n-----------------------\n"
            )

# Example usage
# Interactive driver: read a corpus, cluster it, show the result and optionally save it.
if __name__ == "__main__":
    # Replace this with your preprocessed text file path
    input_file = input("Enter the path to your preprocessed text file: ").strip()
    if not os.path.isfile(input_file):
        print("File not found.")
        exit()

    # Read the preprocessed text file
    data = read_preprocessed_file(input_file)

    # Perform K-means clustering and determine optimal number of clusters
    clusters, centroids, feature_names, num_clusters = kmeans_cluster_auto(data)

    # Display clustered output
    display_clusters(data, clusters, feature_names)

    # Save clustered output to a text file.  An empty answer means "do not
    # save"; passing '' to open() would raise FileNotFoundError.
    output_file = input("Enter the path to save the clustered output as a text file: ").strip()
    if output_file:
        save_clusters_to_file(output_file, data, clusters)
        print(f"Clustered output saved to {output_file}")
    else:
        print("No output file path provided. Clustered output not saved.")

    print(f"Optimal number of clusters determined automatically: {num_clusters}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
502257096167931904|Thu
502257096834809857|Thu
502367601394061312|Thu
502384167103901696|Thu
502400533471047680|Thu
502407946689470464|Thu
502416168380923904|Thu
502418274340978688|Thu
502418379471618048|Thu
502434889333874689|Thu
502450981867499520|Thu
502468268670865408|Thu
502478542513664000|Thu
502479268052729856|Thu
502482001828790272|Thu
502482117855436800|Thu
502482118551670784|Thu
502484246683213825|Thu
502486739618119682|Thu
502499103784591361|Thu
502505362642067457|Thu
502533601410711553|Thu
502536357365809152|Thu
502547596855246849|Thu
502548795130142720|Thu
502554518333440000|Thu
502554520271216640|Thu
502573949780508672|Thu
502589093545771009|Thu
502603293924003843|Thu
502603295710777344|Thu
504793379473858560|Thu
504821324498223104|Thu
504881475024461825|Thu
504926732516741120|Thu
504927212424802305|Thu
504935373235961856|Thu
504941359099162624|Thu
504943407551090689|Thu
504960128173764608|Thu
504974163518709

FileNotFoundError: [Errno 2] No such file or directory: ''

In [None]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_score
from joblib import Parallel, delayed

# Function to read the preprocessed text file
def read_preprocessed_file(file_path):
    """Load a UTF-8 text file as a list of lines.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        return list(source)

# Function to perform K-means clustering and determine optimal number of clusters
def kmeans_cluster_auto(data):
    """Cluster documents with MiniBatchKMeans, choosing k by silhouette score.

    The documents are vectorised with TF-IDF (English stop words removed).
    Every k from 2 up to min(10, n_samples - 1) is fitted in parallel and the
    k with the highest average silhouette score wins; ties go to the
    smallest k.

    Args:
        data: Sequence of text documents, one string per sample.

    Returns:
        A tuple (labels, centroids, feature_names, best_k): the per-sample
        cluster labels and cluster centres of the winning model, the TF-IDF
        vocabulary, and the chosen number of clusters.

    Raises:
        ValueError: If fewer than 3 documents are supplied, because the
            silhouette score needs between 2 and n_samples - 1 labels.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    # CSC conversion kept from the original code.
    # NOTE(review): its stated purpose was "ensure data is writable" - confirm it is still needed.
    X = vectorizer.fit_transform(data).tocsc()

    num_samples = X.shape[0]
    # silhouette_score is only defined for 2 <= n_labels <= n_samples - 1,
    # so k must stay strictly below the number of samples.
    max_clusters = min(10, num_samples - 1)
    if max_clusters < 2:
        raise ValueError(
            f"At least 3 documents are required to select a cluster count; got {num_samples}."
        )

    def cluster_and_score(num_clusters):
        # Fit one candidate model and score how well separated its clusters are.
        kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42)
        kmeans.fit(X)
        clusters = kmeans.labels_
        silhouette_avg = silhouette_score(X, clusters)
        return silhouette_avg, clusters, kmeans.cluster_centers_

    # Adjust the range as needed
    cluster_range = range(2, max_clusters + 1)
    results = Parallel(n_jobs=-1)(delayed(cluster_and_score)(num_clusters) for num_clusters in cluster_range)

    # Pick the winner by position so no tuple holding numpy arrays has to be
    # compared for equality; max() returns the first maximum, i.e. the
    # smallest k on ties.
    best_index = max(range(len(results)), key=lambda i: results[i][0])
    _, best_clusters, best_centroids = results[best_index]
    best_num_clusters = cluster_range[best_index]

    feature_names = vectorizer.get_feature_names_out()
    return best_clusters, best_centroids, feature_names, best_num_clusters

# Function to display clustered output
def display_clusters(data, clusters, feature_names):
    """Print every cluster with the documents assigned to it.

    Clusters appear in the order their ids are first seen in `clusters` and
    are numbered from 1 (id + 1).  Each document is stripped of surrounding
    whitespace before printing.

    Args:
        data: Sequence of documents, indexable by sample position.
        clusters: Per-sample cluster ids, aligned with `data`.
        feature_names: Accepted for interface compatibility; not used here.
    """
    groups = {}
    for position, label in enumerate(clusters):
        groups.setdefault(label, []).append(data[position].strip())

    for label, members in groups.items():
        print(f"Cluster {label + 1}:", "\n".join(members), "-----------------------", sep="\n")

# Function to save clustered output to a text file
def save_clusters_to_file(output_file, data, clusters):
    """Write every cluster and its documents to a UTF-8 text file.

    The layout mirrors the console display: a "Cluster N:" header (id + 1),
    the stripped documents one per line, then a dashed separator.  Clusters
    are written in the order their ids are first seen in `clusters`.

    Args:
        output_file: Destination path; an existing file is overwritten.
        data: Sequence of documents, indexable by sample position.
        clusters: Per-sample cluster ids, aligned with `data`.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        groups = {}
        for position, label in enumerate(clusters):
            groups.setdefault(label, []).append(data[position].strip())

        for label, members in groups.items():
            sink.write(
                f"Cluster {label + 1}:\n"
                + "\n".join(members)
                + "\n-----------------------\n"
            )

# Example usage
# Interactive driver: read a corpus, cluster it, show the result and optionally save it.
if __name__ == "__main__":
    # Ask where the corpus lives and stop early when it is not a regular file.
    input_file = input("Enter the path to your preprocessed text file: ").strip()
    if not os.path.isfile(input_file):
        print("File not found.")
        exit()

    # Load the documents, choose the cluster count automatically, then show the grouping.
    data = read_preprocessed_file(input_file)
    clusters, centroids, feature_names, num_clusters = kmeans_cluster_auto(data)
    display_clusters(data, clusters, feature_names)

    # Saving is optional: a blank answer skips the write.
    output_file = input("Enter the path to save the clustered output as a text file: ").strip()
    if not output_file:
        print("No output file path provided. Clustered output not saved.")
    else:
        save_clusters_to_file(output_file, data, clusters)
        print(f"Clustered output saved to {output_file}")

    print(f"Optimal number of clusters determined automatically: {num_clusters}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
502367601394061312|Thu
502384167103901696|Thu
502400533471047680|Thu
502407946689470464|Thu
502416168380923904|Thu
502418274340978688|Thu
502418379471618048|Thu
502434889333874689|Thu
502450981867499520|Thu
502468268670865408|Thu
502478542513664000|Thu
502479268052729856|Thu
502482001828790272|Thu
502482117855436800|Thu
502482118551670784|Thu
502484246683213825|Thu
502486739618119682|Thu
502499103784591361|Thu
502505362642067457|Thu
502533601410711553|Thu
502536357365809152|Thu
502547596855246849|Thu
502548795130142720|Thu
502554518333440000|Thu
502554520271216640|Thu
502573949780508672|Thu
502589093545771009|Thu
502603293924003843|Thu
502603295710777344|Thu
504793379473858560|Thu
504821324498223104|Thu
504881475024461825|Thu
504926732516741120|Thu
504927212424802305|Thu
504935373235961856|Thu
504941359099162624|Thu
504943407551090689|Thu
504960128173764608|Thu
504974163518709760|Thu
504974199564550144|Thu
504985077689106