In [None]:
# coding:gbk
import jieba
import gensim
import re
import os

jieba.setLogLevel(jieba.logging.INFO)

# 获取指定路径的文件内容
def get_file_contents(path):
    """Return the entire contents of the UTF-8 text file at ``path``.

    Args:
        path: Location of the file to read.

    Returns:
        The file's text as one string (``''`` for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when reading fails part-way,
    # and a single read() avoids rebuilding the string once per line.
    with open(path, 'r', encoding='UTF-8') as f:
        return f.read()

#将读取到的文件内容先进行jieba分词，然后再把标点符号、转义符号等特殊符号过滤掉
def filter(str):
    """Segment text with jieba and drop tokens that do not start with a word character.

    A token is kept when its first character is an ASCII letter, an ASCII
    digit, or a character in the range U+4E00-U+9FA5; ``re.match`` only
    anchors at the start, so the rest of the token is not inspected.

    Args:
        str: The text to segment.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    tokens = jieba.lcut(str)
    return [
        token
        for token in tokens
        if re.match(u"[a-zA-Z0-9\u4e00-\u9fa5]", token)
    ]

# 忽略掉文本的语法和语序等要素，将其仅仅看作是若干个词汇的集合
def convert_corpus(text1, text2):
    """Turn two token lists into bag-of-words vectors over one shared dictionary.

    Word order is discarded: each document is reduced to whatever
    ``Dictionary.doc2bow`` produces for it.

    Args:
        text1: Token list of the first document.
        text2: Token list of the second document.

    Returns:
        A two-element list holding the bag-of-words form of ``text1`` and
        ``text2``, in that order.
    """
    documents = [text1, text2]
    vocabulary = gensim.corpora.Dictionary(documents)
    first_bow = vocabulary.doc2bow(text1)
    second_bow = vocabulary.doc2bow(text2)
    return [first_bow, second_bow]

#传入过滤之后的数据，通过调用gensim.similarities.Similarity计算余弦相似度
def calc_similarity(text1, text2):
    """Score how similar ``text2`` is to ``text1`` using a gensim Similarity index.

    Both token lists are mapped to bag-of-words vectors over a shared
    dictionary, indexed with ``gensim.similarities.Similarity`` (a cosine
    similarity index, per the gensim documentation), and the index is then
    queried with ``text1``.

    Args:
        text1: Token list of the reference document.
        text2: Token list of the document being compared.

    Returns:
        The entry at position 1 of the query result, i.e. the score of
        ``text1`` against the indexed ``text2``.
    """
    documents = [text1, text2]
    vocabulary = gensim.corpora.Dictionary(documents)
    bow_vectors = [vocabulary.doc2bow(tokens) for tokens in documents]
    # The first argument is the file-name prefix gensim uses for index shards.
    index = gensim.similarities.Similarity(
        '-Similarity-index',
        bow_vectors,
        num_features=len(vocabulary),
    )
    query = vocabulary.doc2bow(text1)
    scores = index[query]
    # Position 0 is text1 against itself; position 1 is text1 against text2.
    return scores[1]


if __name__ == '__main__':
    # Ask for both document paths first, then validate them one by one so the
    # user is told which of the two is missing.
    path1 = input("输入论文原文的文件的绝对路径：")
    path2 = input("输入抄袭版论文的文件的绝对路径：")
    if not os.path.exists(path1):
        print("论文原文文件不存在！")
        exit()
    if not os.path.exists(path2):
        print("抄袭版论文文件不存在！")
        exit()
    # Absolute path of the result file.
    # NOTE(review): this location is hard-coded to one machine's desktop and
    # the open() below fails if the directory does not exist — consider making
    # it configurable.
    save_path ="C:\\Users\\lenovo\\Desktop\\copy\\result.txt"
    str1 = get_file_contents(path1)
    str2 = get_file_contents(path2)
    text1 = filter(str1)
    text2 = filter(str2)
    similarity = calc_similarity(text1, text2)
    # Format once so the console and the result file show the same text.
    report = "文章相似度： %.2f" % similarity
    print(report)
    # Write the result to the file; the context manager guarantees the handle
    # is closed (and the data flushed) even if write() raises.
    with open(save_path, 'w', encoding="utf-8") as f:
        f.write("python" + " " + "main.py" + " " + path1 + " " + path2 + " " + report)

In [None]:
import jieba
import math
import re
#读入两个txt文件存入s1,s2字符串中
# s1 = open('../文本分析/text/2020.txt','r').read()
# s2 = open('../文本分析/text/2021.txt','r').read()
# Read both documents as raw bytes. Read-only mode ('rb') is enough here, and
# the context managers make sure the handles are closed.
# NOTE(review): the bytes are handed to jieba undecoded; jieba's README says it
# accepts UTF-8 (and GBK) byte strings — confirm the files' encoding.
with open('orig_0.8_del.txt', 'rb') as f1:
    s1 = f1.read()
with open('orig_0.8_add.txt', 'rb') as f2:
    s2 = f2.read()

# Load the stop-word list, one entry per line with the newline removed.
# A set gives O(1) membership tests in the filters below.
# NOTE(review): 'orig.txt' is read as the stop-word file — verify this is the
# intended file.
with open('orig.txt', 'r', encoding='utf-8-sig') as fstop:
    stopwords = {eachWord.replace("\n", "") for eachWord in fstop}

# Segment both documents in jieba's full mode, dropping stop words and empty
# tokens.
s1_cut = [w for w in jieba.cut(s1, cut_all=True) if w not in stopwords and w != '']
s2_cut = [w for w in jieba.cut(s2, cut_all=True) if w not in stopwords and w != '']
word_set = set(s1_cut).union(s2_cut)

# Assign every distinct word from either document a vector position.
word_dict = {word: index for index, word in enumerate(word_set)}

# Bag-of-words model: count how often each word occurs in each document.
s1_cut_code = [0] * len(word_dict)
for word in s1_cut:
    s1_cut_code[word_dict[word]] += 1

s2_cut_code = [0] * len(word_dict)
for word in s2_cut:
    s2_cut_code[word_dict[word]] += 1

# Cosine similarity: dot product divided by the product of the vector norms.
# The accumulator is deliberately not called `sum`, so the builtin stays usable.
dot_product = 0
sq1 = 0
sq2 = 0
for count1, count2 in zip(s1_cut_code, s2_cut_code):
    dot_product += count1 * count2
    sq1 += count1 * count1
    sq2 += count2 * count2

norm_product = math.sqrt(sq1) * math.sqrt(sq2)
# A zero norm means at least one document has no usable tokens; report 0.0
# rather than dividing by zero.
result = round(dot_product / norm_product, 3) if norm_product else 0.0
print("\n余弦相似度为：%f"%result)

In [None]:
import re
import jieba
import jieba.analyse
import numpy as np


# Matches every character that is NOT an ASCII letter, an ASCII digit or a
# character in U+4E00-U+9FA5. Compiled once instead of once per line.
_NON_WORD_PATTERN = re.compile(u"[^a-zA-Z0-9\u4e00-\u9fa5]")


def _tokenize_file(path):
    """Return the jieba tokens of the UTF-8 file at ``path``; ``[]`` if it is missing."""
    try:
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"{path}这个路径下没有文件")
        return []

    tokens = []
    for line in lines:
        # Strip punctuation/whitespace first, then segment what is left.
        tokens.extend(jieba.lcut(_NON_WORD_PATTERN.sub("", line)))
    return tokens


def short_analyse(o_file, c_file):
    """Print the cosine similarity of two files' word-frequency vectors.

    Each file is read line by line, stripped of everything except ASCII
    letters, digits and characters in U+4E00-U+9FA5, and segmented with
    jieba. The combined vocabulary is printed, followed by the similarity.
    A missing file is reported and treated as an empty document.

    :param o_file: path of the original paper
    :param c_file: path of the paper being checked
    :return: None; the results are printed
    """
    from collections import Counter

    jieba.setLogLevel(jieba.logging.INFO)  # set jieba's logger to INFO level

    o_list = _tokenize_file(o_file)
    c_list = _tokenize_file(c_file)

    all_words = list(set(o_list).union(set(c_list)))
    print(all_words)

    # Count each document once; calling list.count() per vocabulary word
    # rescans the whole token list every time.
    o_counts = Counter(o_list)
    c_counts = Counter(c_list)
    laa = np.array([o_counts[word] for word in all_words])
    lbb = np.array([c_counts[word] for word in all_words])

    # Cosine similarity = dot(a, b) / (|a| * |b|).
    denominator = np.sqrt(np.dot(laa, laa)) * np.sqrt(np.dot(lbb, lbb))
    if denominator == 0:
        # At least one document has no tokens; report 0.0 instead of nan.
        cos = 0.0
    else:
        cos = np.dot(laa, lbb) / denominator
    print(f"两篇文章的相似度为{cos}")

def long_analyse(fname):
    """Extract up to 10 keywords from the UTF-8 text file ``fname``.

    :param fname: path of the file to analyse
    :return: the list returned by ``jieba.analyse.extract_tags`` with
        ``topK=10``, or an empty list when the file does not exist
    """
    try:
        with open(fname, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"{fname}这个路径下并不存在文件")
        # There is no text to analyse. Return early, because falling through
        # would read the unbound name `content` and raise UnboundLocalError.
        return []

    return jieba.analyse.extract_tags(content, topK=10)

def compute_sim(o_file, c_file):
    """Return the overlap of two collections as a percentage.

    The score is ``100 * |A & B| / |A | B|`` over the distinct elements of
    the two arguments, rounded to two decimals.

    :param o_file: iterable of hashable items (``ans`` passes a keyword list)
    :param c_file: iterable of hashable items to compare against ``o_file``
    :return: a float in ``[0, 100]``; ``0.0`` when both inputs are empty
    """
    first = set(o_file)
    second = set(c_file)
    union = first | second
    if not union:
        # Two empty inputs share nothing; avoid dividing by zero.
        return 0.0
    return round(len(first & second) * 100 / len(union), 2)

def ans(o_file, c_file):
    """Compare two files by the overlap of their extracted keywords.

    :param o_file: path of the original paper
    :param c_file: path of the paper being checked
    :return: the value ``compute_sim`` returns for the two keyword lists
        produced by ``long_analyse``
    """
    original_tags = long_analyse(o_file)
    candidate_tags = long_analyse(c_file)
    return compute_sim(original_tags, candidate_tags)

In [None]:
def main():
    """Run both comparisons on the hard-coded sample file names and print the results."""
    original_path, candidate_path = 'orig.txt', 'orig_0.8_add.txt'

    # Word-frequency comparison; short_analyse prints its own output.
    short_analyse(original_path, candidate_path)

    # Keyword-overlap comparison; ans returns the score to print here.
    score = ans(original_path, candidate_path)
    print(f"The similarity between the papers is: {score}%")


if __name__ == '__main__':
    main()